Пример #1
0
def train_vectors(options, sens=None):
    """Dispatch embedding training to the backend selected by ``options.model``.

    Args:
        options: run configuration. This function reads ``vectors_path``,
            ``always_rebuild`` and ``model`` from it and hands the whole
            object to the selected backend.
        sens: optional value forwarded as ``sens`` to the skip-gram backend
            only (models 'DeepWalk' and 'Node2Vec'); the other backends
            receive ``options`` alone.

    Returns:
        None. Nothing is trained when ``utils.check_rebuild`` returns a
        falsy value for ``options.vectors_path``.

    Raises:
        SystemExit: with a non-zero status when ``options.model`` is not one
            of the supported model names.
    """
    if not utils.check_rebuild(options.vectors_path,
                               descrip='embedding vectors',
                               always_rebuild=options.always_rebuild):
        return

    model = options.model
    if model in ('DeepWalk', 'Node2Vec'):
        # NOTE: the options.using_tensorflow / TF_skipgram branch is disabled
        # here; the plain skipgram module is always used.
        skipgram.train_vectors(options, sens=sens)
    elif model == 'LINE':
        TF_line.train_vectors(options)
    elif model == 'RWNE':
        TF_rwne.train_vectors(options)
    elif model == 'SDNE':
        TF_sdne.train_vectors(options)
    elif model == 'DNGR':
        TF_dngr.train_vectors(options)
    elif model == 'GraRep':
        grarep.train_vectors(options)
    elif model == 'GCN':
        TF_gcn.train_vectors(options)
    else:
        # The hint must name every model handled above, otherwise users are
        # told that supported models are invalid.
        logger.error("Unknown model for embedding: '%s'. " % model +
                     "Valid models: 'DeepWalk', 'Node2Vec', 'LINE', 'RWNE', "
                     "'SDNE', 'DNGR', 'GraRep', 'GCN'.")
        # Exit with a failure status so calling scripts can detect the error.
        sys.exit(1)
Пример #2
0
def build_walk_corpus_to_files(filebase,
                               walk_times,
                               headflag_of_index_file='',
                               max_num_workers=cpu_count(),
                               always_rebuild=False):
    """Build the walk corpus on disk, in one process or several.

    Args:
        filebase: path handed to ``utils.check_rebuild`` and to the corpus
            writers.
        walk_times: walk count handed to the corpus writers; a value of 1 or
            less forces the single-process path.
        headflag_of_index_file: forwarded to the multi-process writer only.
        max_num_workers: worker limit; a value of 1 or less forces the
            single-process path. The default is the CPU count at import time.
        always_rebuild: forwarded to ``utils.check_rebuild``.

    Returns:
        None when ``utils.check_rebuild`` returns a falsy value. Otherwise
        either a one-element list holding the single-process writer's result,
        or whatever the multi-process writer returns.
    """
    rebuild_needed = utils.check_rebuild(
        filebase, descrip='walk corpus', always_rebuild=always_rebuild)
    if not rebuild_needed:
        return

    single_process = max_num_workers <= 1 or walk_times <= 1
    if not single_process:
        return _construct_walk_corpus_and_write_multiprocess(
            filebase,
            walk_times,
            headflag_of_index_file=headflag_of_index_file,
            max_num_workers=max_num_workers)

    # Workers were requested but there is at most one walk round to share.
    if max_num_workers > 1:
        logger.warning(
            'Corpus bulid: walk times too small, using single-process instead...'
        )
    logger.info(
        'Corpus bulid: walking to files (without using multi-process)...')
    began = time.time()
    written = _construct_walk_corpus_and_write_singprocess(
        (filebase, walk_times))
    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             began))
    return [written]
Пример #3
0
def store_walk_corpus(filebase, walk_sens, always_rebuild=False):
    """Write a walk corpus to ``filebase``, one walk per line.

    Each item of a walk is written as ``str(item)`` followed by one space, so
    every line ends with a trailing space before the newline.

    Args:
        filebase: output path; also passed to ``utils.check_rebuild``.
        walk_sens: iterable of walks, each an iterable of items.
        always_rebuild: forwarded to ``utils.check_rebuild``.

    Returns:
        None. Nothing is written when ``utils.check_rebuild`` returns a falsy
        value.
    """
    should_store = utils.check_rebuild(
        filebase, descrip='walk corpus', always_rebuild=always_rebuild)
    if not should_store:
        return

    logger.info('Corpus store: storing...')
    began = time.time()
    with open(filebase, 'w') as corpus_file:
        for walk in walk_sens:
            corpus_file.writelines(
                u"{} ".format(str(node)) for node in walk)
            corpus_file.write('\n')
    logger.info('Corpus store: store completed in {}s'.format(time.time() -
                                                              began))
Пример #4
0
def train_vectors(options, sens=None):
    """Train embedding vectors with the backend named by ``options.model``.

    Args:
        options: run configuration. ``vectors_path``, ``always_rebuild`` and
            ``model`` are read here; the object is passed on to the backend.
        sens: forwarded as ``sens`` to the skip-gram backend ('DeepWalk')
            only.

    Returns:
        None. Training is skipped when ``utils.check_rebuild`` returns a
        falsy value.

    Raises:
        SystemExit: when ``options.model`` is not a supported model name.
    """
    rebuild_needed = utils.check_rebuild(
        options.vectors_path,
        descrip='embedding vectors',
        always_rebuild=options.always_rebuild)
    if not rebuild_needed:
        return

    model_name = options.model
    if model_name == 'DeepWalk':
        skipgram.train_vectors(options, sens=sens)
        return
    if model_name == 'LINE':
        TF_line.train_vectors(options)
        return
    if model_name == 'PTE':
        TF_pte.train_vectors(options)
        return
    if model_name == "SpaceyWalk":
        TF_spaceywalk.train_vectors(options)
        return

    # No backend matched: report the supported names and stop the program.
    complaint = "Unknown model for embedding: '%s'. " % model_name
    logger.error(complaint +
                 "Valid models: 'DeepWalk', 'LINE', 'PTE', 'SpaceyWalk'.")
    sys.exit()
Пример #5
0
def train_vectors(options):
    """Train GraRep embeddings for the configured network and save them.

    The network is built with ``network.construct_network(options)``. A short
    run description is logged and also written to ``embedding.info`` next to
    the vectors file. The vectors returned by ``GraRep.train()`` are then
    stored at ``options.vectors_path`` through ``save_word2vec_format``.

    Args:
        options: run configuration. ``vectors_path``, ``always_rebuild``,
            ``model`` and ``embedding_size`` are read here; the object is
            also passed to ``network.construct_network``.

    Returns:
        None. Nothing is done when ``utils.check_rebuild`` returns a falsy
        value for ``options.vectors_path``.
    """
    if not utils.check_rebuild(options.vectors_path,
                               descrip='vectors',
                               always_rebuild=options.always_rebuild):
        return
    train_vec_dir = os.path.split(options.vectors_path)[0]
    # A bare file name has an empty directory part and os.makedirs('') would
    # raise, so only create a directory when there is one to create.
    if train_vec_dir and not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)

    # construct network
    net = network.construct_network(options)

    # Fourth argument of GraRep; presumably the number of transition steps
    # (NOTE(review): confirm against the GraRep class).
    Kstep = 2

    # The same description goes to the log and to embedding.info.
    nodes_size = net.get_nodes_size()
    info_lines = [
        '\t train_model = {}'.format(options.model),
        '\t total embedding nodes = {}'.format(nodes_size),
        '\t total edges = {}'.format(net.get_edges_size()),
        '\t embedding size = {}'.format(options.embedding_size),
        '\t Kstep = {}'.format(Kstep),
        '\t vectors_path = {}'.format(options.vectors_path),
    ]

    logger.info('Train info:')
    for line in info_lines:
        logger.info(line)

    # The context manager closes the file even if a write fails.
    with open(os.path.join(train_vec_dir, 'embedding.info'), 'w') as fr_vec:
        fr_vec.write('embedding info:\n')
        for line in info_lines:
            fr_vec.write(line + '\n')

    # train
    logger.info('training...')
    time_start = time.time()
    grarep = GraRep(nodes_size, net.edges, options.embedding_size, Kstep)
    vecs = grarep.train()
    save_word2vec_format(options.vectors_path, vecs, net._idx_nodes)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
Пример #6
0
def eval_once(options):
    """Run the clustering evaluation once and write a report file.

    Labeled data and embedding vectors are loaded and published through the
    module globals ``features_matrix``, ``labels_matrix`` and ``LABEL_SIZE``,
    which ``_cluster_thread_body`` presumably reads (NOTE(review): confirm).
    The evaluation is repeated ``options.repeated_times`` times, split over a
    process pool when both ``options.eval_workers`` and
    ``options.repeated_times`` exceed 1. The mean and the individual results
    are written to ``options.cluster_path``.

    Args:
        options: run configuration. Reads ``cluster_path``,
            ``always_rebuild``, ``multilabel_rule``, ``eval_online``,
            ``eval_workers``, ``repeated_times``, ``label_path``,
            ``vectors_path`` and ``label_size``.

    Returns:
        None. Nothing is done when ``utils.check_rebuild`` returns a falsy
        value for ``options.cluster_path``.

    Raises:
        Exception: whatever the evaluation raises on its second attempt; a
            first failure is logged and retried once.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of every item is kept for clustering.
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_multiprocess = (options.eval_workers > 1
                        and options.repeated_times > 1)
    times_per_worker = None  # filled in below for the multi-process case

    def _run_repeats():
        """Run all repetitions and return the collected results."""
        if not use_multiprocess:
            return _cluster_thread_body(options.repeated_times)
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    # cluster
    # The context manager closes the report even if the evaluation fails.
    with open(options.cluster_path, 'w') as fr:
        fr.write('eval case: cluster...\n')
        fr.write('\t save_path: {}\n'.format(options.cluster_path))
        fr.write('\t cluster: kmeans\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))

        if use_multiprocess:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                # Spread the repetitions evenly; the first `mod` workers take
                # one extra repetition each.
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

        # One retry on failure, as before, but KeyboardInterrupt/SystemExit
        # now abort instead of silently restarting the evaluation.
        try:
            nmi_list = _run_repeats()
        except Exception as err:
            logger.warning(
                "cluster eval failed ({}), retry once...".format(err))
            nmi_list = _run_repeats()

        if use_multiprocess and len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))

        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat, nmi in enumerate(nmi_list):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat + 1, len(nmi_list), nmi))
        fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                                 time_start))
    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))

    return
Пример #7
0
def eval_online(options):
    """Evaluate clustering (NMI) on each new checkpoint while training runs.

    The function polls the ``ckpt`` directory next to
    ``options.vectors_path``. For every checkpoint step newer than the last
    evaluated one it loads the vectors, runs the clustering evaluation
    ``options.repeated_times`` times, and then:

    * writes a per-checkpoint report to ``options.cluster_path + '.<step>'``,
    * appends the mean NMI to ``options.cluster_path``,
    * adds an ``nmi`` summary for the step,
    * copies the checkpoint files to ``model.ckpt-best.*`` in the cluster
      directory when the mean NMI is the best so far.

    Access to the vectors file is coordinated through marker files: this
    function creates ``<vectors_path>.reading_cluster`` while loading and
    backs off while ``<vectors_path>.writing`` exists. It returns once a
    ``RUN_SUCCESS`` file exists in the vectors directory and no newer
    checkpoint is pending.

    Args:
        options: run configuration. Reads ``cluster_path``,
            ``always_rebuild``, ``multilabel_rule``, ``eval_online``,
            ``eval_interval``, ``eval_workers``, ``repeated_times``,
            ``label_size``, ``label_path`` and ``vectors_path``.

    Returns:
        None. Nothing is done when ``utils.check_rebuild`` returns a falsy
        value for the cluster directory.

    Raises:
        Exception: whatever the evaluation raises on its second attempt; a
            first failure is logged and retried once.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    if not utils.check_rebuild(cluster_dir,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_multiprocess = (options.eval_workers > 1
                        and options.repeated_times > 1)
    times_per_worker = None
    if use_multiprocess:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            # Spread the repetitions evenly; the first `mod` workers take one
            # extra repetition each.
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    def _run_repeats():
        """Run all repetitions and return the collected results."""
        if not use_multiprocess:
            return _cluster_thread_body(options.repeated_times)
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    fr_total = open(options.cluster_path, 'w')
    summary_writer = None
    # try/finally: the summary file and the summary writer are closed on
    # every exit path, including the early return on RUN_SUCCESS.
    try:
        fr_total.write('eval case: cluster...\n')
        fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
        fr_total.write('\t cluster: kmeans\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(
            options.multilabel_rule))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(NMI):\n=============================================================\n'
        )
        fr_total.write('finish_time\tckpt\tNMI\n')

        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        time_start = time.time()
        id_list, labels_list = utils.get_labeled_data(
            options.label_path, multilabel_rule=options.multilabel_rule)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='nmi', simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_nmi = 0

        vectors_dir = os.path.split(options.vectors_path)[0]
        ckpt_dir = os.path.join(vectors_dir, 'ckpt')
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        reading = options.vectors_path + ".reading_cluster"
        writing = options.vectors_path + ".writing"
        while options.eval_online:
            # Wait for a newer checkpoint whose vectors file is present and
            # not being written.
            while True:
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                cur_step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                if cur_step <= last_step or (not os.path.exists(
                        options.vectors_path)) or os.path.exists(writing):
                    if os.path.exists(
                            os.path.join(vectors_dir, "RUN_SUCCESS")):
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                # Create the marker and release the handle at once; only the
                # file's existence matters.
                open(reading, "w").close()  # declare
                # Re-check after a pause in case a writer started meanwhile.
                time.sleep(30)
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                cur_step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                if cur_step <= last_step or (not os.path.exists(
                        options.vectors_path)) or os.path.exists(writing):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue

                break
            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                # Keep the labels loaded from file untouched: feeding the
                # returned list back in with the unchanged id_list on the
                # next checkpoint could misalign ids and labels if
                # get_vectors drops entries (NOTE(review): confirm against
                # utils.get_vectors).
                features_matrix, cur_labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list,
                    labels_list)
            finally:
                # Always drop the marker so a failed load cannot leave a
                # stale "reading" file behind.
                os.remove(reading)  # synchrolock for multi-process
            logger.info("\t done for reading ...")
            # Only the first label of every item is kept for clustering.
            labels_matrix = np.array([item[0] for item in cur_labels_list])
            LABEL_SIZE = options.label_size
            logger.info('\t reading labeled data completed in {}s'.format(
                time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(
                options.label_size))

            # cluster
            with open(options.cluster_path + '.{}'.format(cur_step),
                      'w') as fr:
                fr.write('eval case: cluster...\n')
                fr.write('\t cluster: kmeans\n')
                fr.write('\t multilabel_rule: {}\n'.format(
                    options.multilabel_rule))
                fr.write('\t eval_workers: {}\n'.format(
                    options.eval_workers))
                fr.write('\t repeat {} times\n'.format(
                    options.repeated_times))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix == i)))

                if use_multiprocess:
                    # speed up by using multi-process
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(times_per_worker)))
                    for idx, rep_times in enumerate(times_per_worker):
                        fr.write("\t process-{}: repeat {} times\n".format(
                            idx, rep_times))

                # One retry on failure, as before, but
                # KeyboardInterrupt/SystemExit now abort instead of silently
                # restarting the evaluation.
                try:
                    nmi_list = _run_repeats()
                except Exception as err:
                    logger.warning(
                        "cluster eval failed ({}), retry once...".format(err))
                    nmi_list = _run_repeats()

                if use_multiprocess and len(
                        nmi_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(nmi_list), options.repeated_times))

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                                                   cur_step))
                summary = tf.Summary()

                mean_nmi = sum(nmi_list) / float(len(nmi_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(options.repeated_times, len(nmi_list)))
                fr.write('\t\t NMI = {}\n'.format(mean_nmi))
                fr.write('details:\n')
                for repeat, nmi in enumerate(nmi_list):
                    fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                        repeat + 1, len(nmi_list), nmi))
                fr.write('\neval case: cluster completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('{}\n'.format(mean_nmi))
            fr_total.flush()
            summary.value.add(tag='nmi', simple_value=mean_nmi)
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'cluster completed in {}s\n================================='.
                format(time.time() - time_start))

            # Keep a copy of the checkpoint files with the best mean NMI.
            if mean_nmi > best_nmi:
                best_nmi = mean_nmi

                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(cluster_dir, 'best_ckpt.info')
                # When the checkpoint files are already gone the old copy is
                # kept, so the note is appended rather than overwriting.
                with open(best_info_path,
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_nmi: {}\n".format(best_nmi))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckptIsExists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        sourceFile = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        targetFile = os.path.join(
                            cluster_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(targetFile):
                            os.remove(targetFile)
                        shutil.copy(sourceFile, targetFile)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()

    return
Пример #8
0
def eval_online(options, net):
    global features_dict, true_edges_list_by_repeat, neg_edges_list_by_repeat
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)
    logger.info('eval case: link_prediction ...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t eval_edge_type: {}'.format(options.eval_edge_type))
    logger.info('\t save_dir: {}\n'.format(link_prediction_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeated_times: {}'.format(options.repeated_times))
    logger.info('\t feature_operators: {}'.format(options.feature_operators))
    logger.info('\t sample_size: {}'.format(options.sample_size))

    time_start = time.time()

    # load_features(options, net)
    load_edges(options, net)

    logger.info('\t total true edges size: {}'.format(
        len(true_edges_list_by_repeat[0])))
    logger.info('\t total neg edges size: {}'.format(
        len(neg_edges_list_by_repeat[0])))

    # repeated 10times
    repeated_times = options.repeated_times
    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [0.01, 0.05] + [v / 10.0 for v in range(1, 10)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    fr_total = open(options.link_prediction_path, 'w')
    fr_total.write('eval case: link_prediction...\n')
    fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_total.write('\t data_name = {}\n'.format(options.data_name))
    fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
    fr_total.write('\t eval_edge_type: {}\n'.format(options.eval_edge_type))
    fr_total.write('\t save_dir: {}\n\n'.format(link_prediction_dir))
    fr_total.write('\t classifier: LogisticRegression\n')
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t feature_operators: {}\n'.format(
        options.feature_operators))
    fr_total.write('\t repeated_times: {}\n'.format(options.repeated_times))
    fr_total.write('\t sample_size: {}\n'.format(options.sample_size))
    fr_total.write('\t total true edges size: {}\n'.format(
        len(true_edges_list_by_repeat[0])))
    fr_total.write('\t total neg edges size: {}\n'.format(
        len(neg_edges_list_by_repeat[0])))
    fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr_total.write(
        '\t results(AUC):\n=============================================================\n'
    )
    tmp_str = ""
    for train_ratio in train_ratio_list:
        for op in options.feature_operators:
            tmp_str = tmp_str + "\t{}({})".format(train_ratio, op)
    fr_total.write('finish_time\tckpt\t' + tmp_str + "\n")

    full_train_ratio_info_list = []
    for repeat in range(repeated_times):
        for op in options.feature_operators:
            for train_ratio in train_ratio_list:
                full_train_ratio_info_list.append((repeat, op, train_ratio))
    if options.eval_workers > 1 and len(full_train_ratio_info_list) > 1:
        # speed up by using multi-process
        if len(full_train_ratio_info_list) <= options.eval_workers:
            train_ratios_per_worker = [[
                train_ratio_info
            ] for train_ratio_info in full_train_ratio_info_list]
        else:
            div, mod = divmod(len(full_train_ratio_info_list),
                              options.eval_workers)
            train_ratios_per_worker = [
                full_train_ratio_info_list[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio_info in enumerate(
                    full_train_ratio_info_list[div * options.eval_workers:]):
                train_ratios_per_worker[idx].append(train_ratio_info)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    last_step = 0
    summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
    summary = tf.Summary()
    for train_ratio in train_ratio_list:
        for op in options.feature_operators:
            summary.value.add(tag='auc_{}_{}'.format(train_ratio, op),
                              simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_auc = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_link_prediction_{}_{}".format(
        options.eval_edge_type[0], options.eval_edge_type[1])
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            ## synchrolock for multi-process:
            # while(not(cur_step > last_step and os.path.exists(options.vectors_path) and
            #                       time.time() - os.stat(options.vectors_path).st_mtime > 200)):
            #     time.sleep(options.eval_interval)
            #     ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            #     cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            # os.utime(options.vectors_path, None)
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue

            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()

        # loading features_matrix(already trained)
        load_features(options, net)

        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")

        logger.info('\t eval_workers: {}'.format(options.eval_workers))
        logger.info('\t repeated_times: {}'.format(options.repeated_times))
        logger.info('\t feature_operators: {}'.format(
            options.feature_operators))
        logger.info('\t sample_size: {}'.format(options.sample_size))
        logger.info('\t repeat {} times for each train_ratio in {}'.format(
            repeated_times, train_ratio_list))
        logger.info('\t total true edges size: {}'.format(
            len(true_edges_list_by_repeat[0])))
        logger.info('\t total neg edges size: {}'.format(
            len(neg_edges_list_by_repeat[0])))

        fr = open(options.link_prediction_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: link_prediction ...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t eval_edge_type: {}\n'.format(options.eval_edge_type))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t feature_operators: {}\n'.format(
            options.feature_operators))
        fr.write('\t sample_size: {}\n'.format(options.sample_size))
        fr.write('\t repeated_times: {}\n'.format(options.repeated_times))
        fr.write('\t total true edges size: {}\n'.format(
            len(true_edges_list_by_repeat[0])))
        fr.write('\t total neg edges size: {}\n'.format(
            len(neg_edges_list_by_repeat[0])))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))

        if options.eval_workers > 1 and len(full_train_ratio_info_list) > 1:
            fr.write("\t using {} processes for evaling:\n".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                fr.write("\t process-{}: {}\n".format(idx, train_ratios))

            try:
                ret_list = []  # (train_ratio, op, auc)
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_classify_thread_body,
                                            train_ratios_per_worker):
                        ret_list.extend(ret)
            except:
                logger.warning("concurrent.futures.process failed, retry...")
                time.sleep(10)
                ret_list = []  # (train_ratio, op, auc)
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_classify_thread_body,
                                            train_ratios_per_worker):
                        ret_list.extend(ret)

        else:
            ret_list = _classify_thread_body(full_train_ratio_info_list)

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        ret_dict = {}
        for train_ratio, op, auc in ret_list:  # ret: (train_ratio, op, auc)
            if (train_ratio, op) in ret_dict:
                ret_dict[(train_ratio, op)].append(auc)
            else:
                ret_dict[(train_ratio, op)] = [auc]

        for train_ratio in train_ratio_list:
            for op in options.feature_operators:
                fr.write('\n' + '-' * 20 + '\n' +
                         'train_ratio = {}, operator = {}\n'.format(
                             train_ratio, op))
                auc_list = ret_dict[(train_ratio, op)]
                if len(auc_list) != repeated_times:
                    logger.warning(
                        "warning: train_ratio={},operator={},, eval unmatched repeated_times: {} != {}"
                        .format(train_ratio, op, len(auc_list),
                                repeated_times))
                mean_auc = sum(auc_list) / float(len(auc_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(repeated_times, len(auc_list)))
                fr.write('\t\t AUC = {}\n'.format(mean_auc))
                fr.write('details:\n')
                for repeat in range(len(auc_list)):
                    fr.write('\t repeated {}/{}: AUC = {}\n'.format(
                        repeat + 1, len(auc_list), auc_list[repeat]))
                fr_total.write('%.4f    ' % (mean_auc))
                summary.value.add(tag='auc_{}_{}'.format(train_ratio, op),
                                  simple_value=mean_auc)
        fr.write('\n eval case: link_prediction completed in {}s\n'.format(
            time.time() - time_start))
        fr.close()
        fr_total.write('\n')
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'link_prediction completed in {}s\n================================='
            .format(time.time() - time_start))

        cur_auc = np.mean(ret_dict[(train_ratio_list[-1],
                                    options.feature_operators[-1])])
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if cur_auc > best_auc:
            best_auc = cur_auc

            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'w')
            else:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write(
                "best_auc(for train_ratio {} and operator {}): {}\n".format(
                    train_ratio_list[-1], options.feature_operators[-1],
                    best_auc))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    link_prediction_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)

        last_step = cur_step

    fr_total.close()
    summary_writer.close()
Пример #9
0
def eval_once(options):
    """Run the link-prediction evaluation once and write a text report.

    Steps, in order:

    1. Return immediately when ``utils.check_rebuild`` reports that
       ``options.link_prediction_path`` must not be rebuilt.
    2. Build the evaluation network (``options.eval_data_path``) and the
       "except"/training network (``options.except_data_path``).
    3. Load the embedding vectors of node ids ``0 .. eval_net_nodes_size - 1``
       from ``options.vectors_path``.
    4. If ``options.sample_nodes > 0``, call ``_sample_thread_body`` for a
       total of ``options.repeated_times`` repetitions (split over up to
       ``options.eval_workers`` processes when both values exceed 1);
       otherwise call ``_eval`` exactly once.
    5. Write the mean and per-repetition MAP / precision@K values to
       ``options.link_prediction_path``.

    The module-level globals named in the ``global`` statement are assigned
    here. NOTE(review): presumably they are read by ``_sample_thread_body``
    and ``_eval`` (including in worker processes) -- confirm.

    Args:
        options: configuration object; the attributes read are the ones
            logged at the top of the function.

    Returns:
        None.
    """
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info(
        '\t reading embedding vectors completed in {}s'.format(time.time() -
                                                               time_start))
    logger.info('total loaded nodes: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('the embedding dimension: {}'.format(
        np.size(features_matrix, axis=1)))

    def write_metrics(out, map_value, precision_at_k):
        # One MAP line followed by precisionK_1 .. precisionK_<max index>;
        # indices beyond the available values are reported as None.
        out.write('\t\t MAP = {}\n'.format(map_value))
        for k in range(options.precK_max_index):
            if k < len(precision_at_k):
                out.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precision_at_k[k]))
            else:
                out.write('\t\t precisionK_{} = None\n'.format(k + 1))

    # The report is written inside a context manager so the file is closed
    # even when the evaluation or the process pool raises.
    with open(options.link_prediction_path, 'w') as fr:
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr.write('\t data_format: {}\n'.format(options.data_format))
        fr.write('\t metrics: MAP and precise@K\n')
        fr.write('\t max_index for precise@K: {}\n'.format(
            options.precK_max_index))
        fr.write('\t similarity_metric: {}\n'.format(
            options.similarity_metric))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr.write('\t sample_nodes_rule: {}\n'.format(
            options.sample_nodes_rule))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr.write('total loaded nodes: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('the embedding dimension: {}\n'.format(
            np.size(features_matrix, axis=1)))

        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                logger.info("\t allocating repeat_times to workers ...")
                if options.repeated_times <= options.eval_workers:
                    # Fewer repetitions than workers: one repetition each.
                    times_per_worker = [
                        1 for _ in range(options.repeated_times)
                    ]
                else:
                    # Even share per worker; the first `mod` workers take
                    # one extra repetition so the total matches.
                    div, mod = divmod(options.repeated_times,
                                      options.eval_workers)
                    times_per_worker = [
                        div for _ in range(options.eval_workers)
                    ]
                    for idx in range(mod):
                        times_per_worker[idx] = times_per_worker[idx] + 1
                assert sum(
                    times_per_worker
                ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                    sum(times_per_worker), options.repeated_times)

                logger.info("\t using {} processes for evaling:".format(
                    len(times_per_worker)))
                for idx, rep_times in enumerate(times_per_worker):
                    logger.info("\t process-{}: repeat {} times".format(
                        idx, rep_times))

                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval,
                              net_except)]  # [[MAP, precisionK_list]]

        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))

        # Averages over all repetitions; precision@K is averaged per K.
        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
        write_metrics(fr, mean_MAP, mean_precisionK)

        fr.write('details:\n')
        for repeat, ret in enumerate(ret_list):
            fr.write('\t repeated {}/{}:\n'.format(repeat + 1,
                                                   len(ret_list)))
            write_metrics(fr, ret[0], ret[1])

        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))

    logger.info(
        'eval case: link_prediction completed in {}s.'.format(time.time() -
                                                              time_start))
Пример #10
0
def _classify_wait_for_new_ckpt(options, ckpt_dir, last_step, reading,
                                writing):
    """Wait until a checkpoint newer than ``last_step`` can be read.

    A checkpoint is considered readable when its step is greater than
    ``last_step``, ``options.vectors_path`` exists and the ``writing`` marker
    file does not exist. Before accepting it, the ``reading`` marker file is
    created, the function sleeps 30 seconds and re-checks the same
    conditions; on conflict the marker is removed and the wait starts over.

    Returns:
        The checkpoint step (int) with the ``reading`` marker file still in
        place, or ``None`` when nothing new is readable and a ``RUN_SUCCESS``
        file exists next to ``options.vectors_path``.
    """

    def latest_step():
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        return int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def not_ready(step):
        return (step <= last_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    run_success = os.path.join(
        os.path.split(options.vectors_path)[0], "RUN_SUCCESS")
    while True:
        cur_step = latest_step()
        if not_ready(cur_step):
            if os.path.exists(run_success):
                return None
            time.sleep(options.eval_interval)
            continue
        # ready for reading
        logger.info("\t declare for reading ...")
        # Create the marker and close the handle right away; only the
        # existence of the file matters.
        open(reading, "w").close()  # declare
        time.sleep(30)
        cur_step = latest_step()
        if not_ready(cur_step):
            os.remove(reading)  # undeclare
            logger.info("\t confliction! undeclare and waiting ...")
            time.sleep(options.eval_interval)
            continue
        return cur_step


def _classify_copy_best_ckpt(ckpt_dir, target_dir, cur_step):
    """Copy the data/index/meta files of ``cur_step`` to ``model.ckpt-best.*``."""
    for suffix in ('data-00000-of-00001', 'index', 'meta'):
        sourceFile = os.path.join(ckpt_dir,
                                  'model.ckpt-%d.%s' % (cur_step, suffix))
        targetFile = os.path.join(target_dir, 'model.ckpt-best.%s' % suffix)
        if os.path.exists(targetFile):
            os.remove(targetFile)
        shutil.copy(sourceFile, targetFile)


def eval_online(options):
    """Evaluate each new checkpoint's embeddings on the classify task.

    After an initial wait for a first checkpoint under
    ``<dir of options.vectors_path>/ckpt``, the function loops while
    ``options.eval_online`` is truthy:

    * wait for a checkpoint step newer than the last evaluated one,
    * load the vectors and labels, publish them through the module globals
      ``features_matrix`` / ``labels_matrix`` and run
      ``_classify_thread_body`` for every (train_ratio, repetition) pair,
      optionally spread over ``options.eval_workers`` processes,
    * write a per-step report to ``options.classify_path + '.<step>'``, one
      summary line to ``options.classify_path`` and the mean scores to a
      TensorFlow summary writer in the classify directory,
    * when the mean Micro-F1 of the largest train ratio improves, record it
      in ``best_ckpt.info`` and copy the checkpoint files to
      ``model.ckpt-best.*`` in the classify directory.

    The loop ends when a ``RUN_SUCCESS`` file exists next to
    ``options.vectors_path`` and no newer checkpoint is readable.

    ``<vectors_path>.reading_classify`` is created by this function while it
    reads the vectors; evaluation is postponed while ``<vectors_path>.writing``
    exists (presumably created by the training process -- confirm).

    Args:
        options: configuration object; see the attributes read below.

    Returns:
        None.
    """
    global features_matrix, labels_matrix
    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)
    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # split ratio: a single configured ratio, or 0.9 down to 0.1
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # Every train ratio repeated `repeated_times` times, in ratio order.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    use_workers = options.eval_workers > 1 and len(train_ratio_fulllist) > 1
    if use_workers:
        # speed up by using multi-process
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div = len(train_ratio_fulllist) // options.eval_workers
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            # Hand the remainder out one item at a time, starting from the
            # last worker and moving backwards.
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    fr_total = open(options.classify_path, 'w')
    summary_writer = None
    # try/finally guarantees that the summary file and the summary writer
    # are closed on every exit path, including the RUN_SUCCESS return below.
    try:
        fr_total.write('eval case: classify...\n')
        fr_total.write('\t save_dir: {}\n'.format(classify_dir))
        fr_total.write('\t classifier: LogisticRegression\n')
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write(
            '\t repeat {} times for each train_ratio in {}\n'.format(
                repeated_times, train_ratio_list))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
        )
        fr_total.write(
            'finish_time\tckpt\t\t0.1\t0.2\t0.3\t0.4\t0.5\t0.6\t0.7\t0.8\t0.9\n'
        )

        time_start = time.time()
        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        id_list_totoal, labels_list_total = utils.get_labeled_data(
            options.label_path)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
        # Seed every tag with 0 at step 0.
        summary = tf.Summary()
        for train_ratio in train_ratio_list:
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=0.)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_micro = 0

        ckpt_dir = os.path.join(
            os.path.split(options.vectors_path)[0], 'ckpt')
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while (not (ckpt and ckpt.model_checkpoint_path)):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        reading = options.vectors_path + ".reading_classify"
        writing = options.vectors_path + ".writing"
        while (options.eval_online):
            cur_step = _classify_wait_for_new_ckpt(options, ckpt_dir,
                                                   last_step, reading,
                                                   writing)
            if cur_step is None:
                # RUN_SUCCESS found and nothing new to evaluate.
                return

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            features_matrix, labels_list = utils.get_vectors(
                utils.get_KeyedVectors(options.vectors_path), id_list_totoal,
                labels_list_total)
            os.remove(reading)  # synchrolock for multi-process
            logger.info("\t done for reading ...")
            # `classes` is passed by keyword: newer scikit-learn releases
            # reject it as a positional argument.
            mlb = MultiLabelBinarizer(classes=range(options.label_size))
            labels_matrix = mlb.fit_transform(labels_list)
            logger.info(
                '\t reading embedding vectors completed in {}s'.format(
                    time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(
                options.label_size))

            # classify
            step_report_path = options.classify_path + '.{}'.format(cur_step)
            with open(step_report_path, 'w') as fr:
                fr.write('eval case: classify...\n')
                fr.write('\t classifier: LogisticRegression\n')
                fr.write('\t eval_workers: {}\n'.format(
                    options.eval_workers))
                fr.write(
                    '\t repeat {} times for each train_ratio in {}\n'.format(
                        repeated_times, train_ratio_list))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix[:, i])))

                if use_workers:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(train_ratios_per_worker)))
                    for idx, train_ratios in enumerate(
                            train_ratios_per_worker):
                        fr.write("\t process-{}: {}\n".format(
                            idx, train_ratios))
                    ret_list = []  # (train_ratio, macro, micro)
                    with ProcessPoolExecutor(
                            max_workers=options.eval_workers) as executor:
                        for ret in executor.map(_classify_thread_body,
                                                train_ratios_per_worker):
                            ret_list.extend(ret)
                else:
                    ret_list = _classify_thread_body(train_ratio_fulllist)

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(
                        time.time())), cur_step))
                summary = tf.Summary()

                # train_ratio -> [macro scores, micro scores]
                ret_dict = {}
                for ret in ret_list:
                    scores = ret_dict.setdefault(ret[0], [[], []])
                    scores[0].append(ret[1])
                    scores[1].append(ret[2])

                for train_ratio, macro_micro in sorted(
                        ret_dict.items(), key=lambda item: item[0]):
                    fr.write('\n' + '-' * 20 + '\n' +
                             'train_ratio = {}\n'.format(train_ratio))
                    Macro_F1_list = macro_micro[0]
                    Micro_F1_list = macro_micro[1]
                    if len(Macro_F1_list) != repeated_times:
                        logger.warning(
                            "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                            .format(train_ratio, len(Macro_F1_list),
                                    repeated_times))
                    mean_Macro_F1 = sum(Macro_F1_list) / float(
                        len(Macro_F1_list))
                    mean_Micro_F1 = sum(Micro_F1_list) / float(
                        len(Micro_F1_list))
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(repeated_times, len(Macro_F1_list)))
                    fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
                    fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
                    fr.write('details:\n')
                    for repeat in range(len(Macro_F1_list)):
                        fr.write(
                            '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'
                            .format(repeat + 1, len(Macro_F1_list),
                                    Macro_F1_list[repeat],
                                    Micro_F1_list[repeat]))
                    fr_total.write('%.4f, %.4f    ' %
                                   (mean_Macro_F1, mean_Micro_F1))
                    summary.value.add(
                        tag='macro_train_{}'.format(train_ratio),
                        simple_value=mean_Macro_F1)
                    summary.value.add(
                        tag='micro_train_{}'.format(train_ratio),
                        simple_value=mean_Micro_F1)

                fr.write('\neval case: classify completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('\n')
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'classify completed in {}s\n================================='.
                format(time.time() - time_start))

            # copy ckpt-files according to last mean_Micro_F1.
            # mean_Micro_F1 holds the value of the last loop iteration above,
            # i.e. of the largest train_ratio (results are visited in
            # ascending order).
            # NOTE(review): the message below always says "ratio 0.9", even
            # when options.train_ratio selects a different single ratio.
            if mean_Micro_F1 > best_micro:
                best_micro = mean_Micro_F1

                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(classify_dir, 'best_ckpt.info')
                # Overwrite when the checkpoint files can be copied;
                # otherwise append a note to the existing record.
                with open(best_info_path,
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_micro(for ratio 0.9): {}\n".format(
                        best_micro))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckptIsExists:
                    _classify_copy_best_ckpt(ckpt_dir, classify_dir,
                                             cur_step)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
Пример #11
0
def build_walk_corpus(options):
    """Build the random-walk corpus for the configured model.

    The function
      * returns immediately when ``utils.check_rebuild`` reports that
        ``options.corpus_store_path`` must not be rebuilt,
      * constructs the network for ``options.model`` ("DeepWalk",
        "SpaceyWalk" or "MetatreeWalk"; any other value logs an error and
        exits the process),
      * logs the walk settings and writes them to ``walks.info`` in the
        directory of ``options.corpus_store_path``,
      * builds the module-level ``walker`` (for "SpaceyWalk" and
        "MetatreeWalk"),
      * for every selected start node, runs
        ``_construct_walk_corpus_and_write_singprocess`` in a process pool,
        with ``options.walk_times`` split across ``options.walk_workers``,
      * finally deletes the global ``walker`` and triggers garbage collection.

    Args:
        options: configuration object; see the attributes read below.

    Returns:
        None.
    """
    global walker

    # check walk info  and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        # NOTE(review): no Walker is constructed in this function for
        # "DeepWalk"; `walker.nodes_size` further down then depends on a
        # module-level `walker` that was set elsewhere -- confirm.
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t seed = {}'.format(options.seed))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t window_size = {}'.format(options.window_size))
    logger.info('\t sample_size = {}'.format(options.sample_size))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # The output directory must exist before walks.info is written into it.
    # An empty dirname (a bare file name) means the current directory, which
    # needs no creation.
    corpus_store_dir = os.path.split(options.corpus_store_path)[0]
    if corpus_store_dir and not os.path.exists(corpus_store_dir):
        os.makedirs(corpus_store_dir)

    with open(os.path.join(corpus_store_dir, 'walks.info'), 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_walks.write('\t data_name = {}\n'.format(options.data_name))
        fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
        fr_walks.write('\t random_walker = {}\n'.format(random_walker))
        fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
        fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
        fr_walks.write('\t max walk workers = {}\n'.format(
            options.walk_workers))
        fr_walks.write('\t seed = {}\n'.format(options.seed))
        fr_walks.write('\t alpha = {}\n'.format(options.alpha))
        fr_walks.write('\t window_size = {}\n'.format(options.window_size))
        fr_walks.write('\t sample_size = {}\n'.format(options.sample_size))
        fr_walks.write('\t walk to memory = {}\n'.format(
            str(options.walk_to_memory)))
        if options.walk_to_memory:
            fr_walks.write('\t donot store corpus = {}\n'.format(
                str(options.not_store_corpus)))
            if not options.not_store_corpus:
                fr_walks.write('\t corpus store path = {}\n'.format(
                    options.corpus_store_path))
        else:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))

    if options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    logger.info(
        'Corpus bulid: walking and computing (using %d workers for multi-process)...'
        % options.walk_workers)
    time_start = time.time()

    # Split walk_times into one share per worker; when it does not divide
    # evenly the first `mod` workers take one extra walk.
    if options.walk_times <= options.walk_workers:
        times_per_worker = [1 for _ in range(options.walk_times)]
    else:
        div, mod = divmod(options.walk_times, options.walk_workers)
        times_per_worker = [div for _ in range(options.walk_workers)]
        for idx in range(mod):
            times_per_worker[idx] = times_per_worker[idx] + 1
    assert sum(
        times_per_worker
    ) == options.walk_times, 'workers allocating failed: %d != %d' % (
        sum(times_per_worker), options.walk_times)

    # Seeded shuffle of all node ids, truncated to sample_size.
    nodes_total = list(range(walker.nodes_size))
    sp_random = random.Random(options.seed)
    sp_random.shuffle(nodes_total)
    nodes_total = nodes_total[0:options.sample_size]
    # NOTE(review): these four node ids are hard-coded and always processed
    # first, in this order, regardless of the sample -- confirm they are
    # intended for every dataset.
    nodes_total[:0] = [8798, 8354, 9891, 8407]
    for node in nodes_total:
        # One task per worker: (output dir, node, first walk index,
        # last walk index, window size), walk indices being 1-based.
        args_list = []
        begin = 0
        for cnt in times_per_worker:
            args_list.append((corpus_store_dir, node, begin + 1, begin + cnt,
                              options.window_size))
            begin += cnt
        with ProcessPoolExecutor(max_workers=options.walk_workers) as executor:
            futures = [
                executor.submit(_construct_walk_corpus_and_write_singprocess,
                                args) for args in args_list
            ]
        # The pool has shut down, so every future is finished. Report worker
        # failures instead of dropping them silently; the run continues.
        for args, future in zip(args_list, futures):
            error = future.exception()
            if error is not None:
                logger.error(
                    'Corpus build: worker failed for node {} (walks {}..{}): {!r}'
                    .format(args[1], args[2], args[3], error))
    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             time_start))
    del walker
    gc.collect()
    return
Пример #12
0
def build_walk_corpus(options):
    """Build the random-walk corpus for DeepWalk, SpaceyWalk or MetatreeWalk.

    The walk settings are logged and also recorded in a ``walks.info`` file
    placed next to ``options.corpus_store_path``.  A ``Walker`` is then
    created, bound to the module-level ``walker`` global for the duration of
    the walk, and deleted again before returning.
    NOTE(review): presumably the corpus builders read that global -- confirm.

    Args:
        options: parsed run options.  Attributes read here: ``model``,
            ``corpus_store_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``walk_times``, ``walk_length``,
            ``walk_workers``, ``walk_to_memory``, ``not_store_corpus``,
            ``alpha``, ``using_metapath``, ``metapath_path``,
            ``history_position``, ``headflag_of_index_file``, ``task`` and
            ``load_from_memory``.

    Returns:
        ``None`` when ``utils.check_rebuild`` reports that nothing has to be
        rebuilt.  Otherwise the walk corpus, which stays ``None`` when walks
        go to files and ``"train"`` is not part of ``options.task``.

    Raises:
        SystemExit: for an unknown ``options.model`` or, with SpaceyWalk, an
            unknown ``options.using_metapath``.
    """
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t alpha = {}'.format(options.alpha))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # Persist the same settings next to the corpus.  The context manager
    # guarantees the handle is released even if one of the writes fails.
    walks_info_path = os.path.join(
        os.path.split(options.corpus_store_path)[0], 'walks.info')
    with open(walks_info_path, 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_walks.write('\t data_name = {}\n'.format(options.data_name))
        fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
        fr_walks.write('\t random_walker = {}\n'.format(random_walker))
        fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
        fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
        fr_walks.write('\t max walk workers = {}\n'.format(
            options.walk_workers))
        fr_walks.write('\t walk to memory = {}\n'.format(
            str(options.walk_to_memory)))
        if options.walk_to_memory:
            fr_walks.write('\t donot store corpus = {}\n'.format(
                str(options.not_store_corpus)))
            if not options.not_store_corpus:
                fr_walks.write('\t corpus store path = {}\n'.format(
                    options.corpus_store_path))
        else:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))

    if options.model == "DeepWalk":
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        # Pick the meta structure that guides the spacey walk.
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        # The corpus is only loaded back when a training task follows.
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    # Drop the global walker and reclaim its memory before returning.
    del walker
    gc.collect()
    return walk_corpus
Пример #13
0
def eval_once(options):
    """Run the t-SNE visualization evaluation once and write a text report.

    Labeled node ids are read from ``options.label_path``, their embedding
    vectors are loaded from ``options.vectors_path``, and the embeddings are
    plotted by ``plot_embedding_in_2D``.  The settings, the per-label counts,
    the figure path and the value returned by the plot function (reported as
    ``clustering_center_distance_sim``) are written to
    ``options.visualization_path``; the same information is sent to the
    logger.

    Args:
        options: parsed run options.  Attributes read here:
            ``visualization_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online`` and ``vectors_path``.

    Returns:
        None.  Returns early, doing nothing, when ``utils.check_rebuild``
        reports that the report does not have to be rebuilt.
    """
    if not utils.check_rebuild(options.visualization_path, descrip='visualization', always_rebuild=options.always_rebuild):
        return

    # Settings block: logged as-is and written to the report with a newline
    # appended to every entry.
    config_lines = [
        'eval case: visualization...',
        '\t data_dir = {}'.format(options.data_dir),
        '\t data_name = {}'.format(options.data_name),
        '\t isdirected = {}'.format(options.isdirected),
        '\t label_path = {}'.format(options.label_path),
        '\t label_size = {}'.format(options.label_size),
        '\t eval_node_type: {}'.format(options.eval_node_type),
        '\t save_path: {}\n'.format(options.visualization_path),
        '\t method: t-SNE',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t marker_size: {}'.format(options.marker_size),
        '\t eval_online: {}'.format(options.eval_online),
    ]
    for line in config_lines:
        logger.info(line)

    # get embedding vectors and markersize
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                  multilabel_rule=options.multilabel_rule,
                                                  type_filepath=os.path.join(options.data_dir,
                                                                             options.data_name + ".nodes"))
    _, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of every node is used for the plot.
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

    embedding_dim = np.size(features_matrix, axis=1)
    stats_lines = [
        '\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)),
        '\t the labels data embedding_dimension: {}'.format(embedding_dim),
        '\t total labels size: {}'.format(options.label_size),
    ]
    stats_lines.extend(
        '\t\t label {}: {}'.format(i, np.sum(labels_matrix == i))
        for i in range(options.label_size))
    for line in stats_lines:
        logger.info(line)

    figure_name = "visualization_" + str(embedding_dim)
    figure_path = os.path.join(os.path.split(options.visualization_path)[0], figure_name)

    # The context manager closes the report even if plotting fails.
    with open(options.visualization_path, 'w') as fr:
        fr.writelines(line + '\n' for line in config_lines + stats_lines)

        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)

        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
        fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
    logger.info('eval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
Пример #14
0
def eval_online(options):
    """Evaluate each new checkpoint's embeddings with a t-SNE visualization.

    The function polls the ``ckpt`` directory located next to
    ``options.vectors_path``.  Whenever the latest checkpoint step is newer
    than the last evaluated one, the vectors file exists and no ``.writing``
    marker is present, it announces itself with a ``.reading_visualization_*``
    marker file, waits 30 seconds, re-checks, then loads the vectors and
    removes the marker.  For every evaluated step it writes a per-step report
    (``options.visualization_path + '.<step>'``), appends one line to the
    summary report (``options.visualization_path``), adds a ``CCD`` scalar to
    a TensorFlow summary and, when the CCD value is the best so far, records
    it in ``best_ckpt.info`` and copies the checkpoint files to
    ``model.ckpt-best.*`` in the visualization directory.

    Polling stops when nothing new is available and a ``RUN_SUCCESS`` file
    exists next to the vectors file.
    NOTE(review): the marker files look like a hand-rolled lock shared with
    the training process -- confirm against the writer side.

    Args:
        options: parsed run options.  Attributes read here:
            ``visualization_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online``, ``eval_interval`` and ``vectors_path``.

    Returns:
        None.
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir, descrip='visualization', always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)

    # Settings shared by the logger, the summary report and the per-step
    # reports (the per-step reports omit the save_dir entry).
    config_head = [
        'eval case: visualization...',
        '\t data_dir = {}'.format(options.data_dir),
        '\t data_name = {}'.format(options.data_name),
        '\t isdirected = {}'.format(options.isdirected),
        '\t label_path = {}'.format(options.label_path),
        '\t label_size = {}'.format(options.label_size),
        '\t eval_node_type: {}'.format(options.eval_node_type),
    ]
    save_dir_line = '\t save_dir: {}\n'.format(visual_dir)
    config_tail = [
        '\t method: t-SNE',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t marker_size: {}'.format(options.marker_size),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_interval: {}s'.format(options.eval_interval),
    ]
    for line in config_head + [save_dir_line] + config_tail:
        logger.info(line)

    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    # get embedding vectors and markersize
    time_start = time.time()
    id_list_total, labels_list_total = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                              multilabel_rule=options.multilabel_rule,
                                                              type_filepath=os.path.join(options.data_dir,
                                                                                         options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

    total_lines = [
        '\t total labeled data size: {}'.format(len(id_list_total)),
        '\t total labels size: {}'.format(options.label_size),
    ]
    for line in total_lines:
        logger.info(line)

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"

    def latest_step():
        # The step number is the text after the last '-' of the checkpoint name.
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(state.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def nothing_to_read(step, evaluated_step):
        # True when there is no newer checkpoint, no vectors file yet, or
        # the ".writing" marker file is present.
        return (step <= evaluated_step
                or (not os.path.exists(options.vectors_path))
                or os.path.exists(writing))

    # "with" / "finally" make sure the summary report and the summary writer
    # are closed on every exit path, including the RUN_SUCCESS return below.
    with open(options.visualization_path, 'w') as fr_total:
        fr_total.writelines(
            line + '\n'
            for line in config_head + [save_dir_line] + config_tail + total_lines)
        fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                       '=============================================================\n')
        fr_total.write('finish_time\tckpt\tCCD\n')

        last_step = 0
        summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
        try:
            summary = tf.Summary()
            summary.value.add(tag='CCD', simple_value=0.)
            summary_writer.add_summary(summary, last_step)

            best_CCD = 0

            # Wait until the first checkpoint shows up.
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            while (not (ckpt and ckpt.model_checkpoint_path)):
                logger.info("model and vectors not exist, waiting...")
                time.sleep(options.eval_interval)
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)

            while (options.eval_online):
                while True:
                    cur_step = latest_step()
                    if nothing_to_read(cur_step, last_step):
                        if os.path.exists(os.path.join(vectors_dir, "RUN_SUCCESS")):
                            return
                        time.sleep(options.eval_interval)
                        continue
                    # ready for reading
                    logger.info("\t declare for reading ...")
                    open(reading, "w").close()  # declare (marker file only)
                    time.sleep(30)
                    # Re-check after the grace period; back off on conflict.
                    cur_step = latest_step()
                    if nothing_to_read(cur_step, last_step):
                        os.remove(reading)  # undeclare
                        logger.info("\t confliction! undeclare and waiting ...")
                        time.sleep(options.eval_interval)
                        continue
                    break
                logger.info("\t eval ckpt-{}.......".format(cur_step))
                time_start = time.time()
                logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
                try:
                    _, features_matrix, labels_list = utils.get_vectors(
                        utils.get_KeyedVectors(options.vectors_path),
                        id_list_total, labels_list_total)
                finally:
                    # synchrolock for multi-process: never leave the marker
                    # behind, even when loading the vectors fails.
                    os.remove(reading)
                logger.info("\t done for reading ...")
                # Only the first label of every node is used for the plot.
                labels_matrix = np.array([item[0] for item in labels_list])
                logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

                step_lines = [
                    '\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)),
                    '\t total labels size: {}'.format(options.label_size),
                ]
                step_lines.extend(
                    '\t\t label {}: {}'.format(i, np.sum(labels_matrix == i))
                    for i in range(options.label_size))
                for line in step_lines:
                    logger.info(line)

                # visualization
                with open(options.visualization_path + '.{}'.format(cur_step), 'w') as fr:
                    fr.writelines(
                        line + '\n'
                        for line in config_head + config_tail + step_lines)

                    fr_total.write('%s ckpt-%-9d: ' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
                    summary = tf.Summary()

                    figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) + '.{}'.format(cur_step)
                    figure_path = os.path.join(visual_dir, figure_name)
                    CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                               features_matrix=features_matrix,
                                               labels_matrix=labels_matrix,
                                               label_size=options.label_size,
                                               figure_path=figure_path)

                    fr.write('\n figure_path: {}\n'.format(figure_path))
                    fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
                    fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))

                fr_total.write('%.4f\n' % CCD)
                fr_total.flush()
                summary.value.add(tag='CCD', simple_value=CCD)
                summary_writer.add_summary(summary, cur_step)
                summary_writer.flush()
                logger.info('visualization completed in {}s\n================================='.format(time.time() - time_start))

                # Keep a copy of the checkpoint with the best CCD so far.
                if CCD > best_CCD:
                    best_CCD = CCD

                    ckpt_exists = os.path.exists(os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                    # If the checkpoint files are already gone, append a note
                    # instead of overwriting the previous best_ckpt.info.
                    best_info_path = os.path.join(visual_dir, 'best_ckpt.info')
                    with open(best_info_path, 'w' if ckpt_exists else 'a') as fr_best:
                        if not ckpt_exists:
                            fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                                          "the current best_ckpt model is loss, but the result is:\n")
                        fr_best.write("best_CCD: {}\n".format(best_CCD))
                        fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                    if ckpt_exists:
                        for suffix in ('data-00000-of-00001', 'index', 'meta'):
                            source_file = os.path.join(ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
                            target_file = os.path.join(visual_dir, 'model.ckpt-best.' + suffix)
                            if os.path.exists(target_file):
                                os.remove(target_file)
                            shutil.copy(source_file, target_file)
                last_step = cur_step
        finally:
            summary_writer.close()
    return
Пример #15
0
def build_walk_corpus(options, net=None):
    """Build the random-walk corpus for a DeepWalk or Node2Vec model.

    The walk settings are logged and also recorded in a ``walks.info`` file
    placed next to ``options.corpus_store_path``.  A ``Walker`` is then
    created, bound to the module-level ``walker`` global for the duration of
    the walk, and deleted again before returning.
    NOTE(review): presumably the corpus builders read that global -- confirm.

    Args:
        options: parsed run options.  Attributes read here: ``model``,
            ``corpus_store_path``, ``always_rebuild``, ``walk_times``,
            ``walk_length``, ``walk_restart`` (DeepWalk), ``p`` and ``q``,
            ``walk_workers``, ``walk_to_memory``, ``not_store_corpus``,
            ``net_info_path`` (Node2Vec), ``headflag_of_index_file``,
            ``task`` and ``load_from_memory``.
        net: an already constructed network; when ``None`` it is built with
            ``network.construct_network(options)``.

    Returns:
        ``None`` when ``utils.check_rebuild`` reports that nothing has to be
        rebuilt.  Otherwise the walk corpus, which stays ``None`` when walks
        go to files and ``"train"`` is not part of ``options.task``.

    Raises:
        SystemExit: for an ``options.model`` other than DeepWalk / Node2Vec.
    """
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
    elif options.model == "Node2Vec":
        random_walker = "bias"
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()
    # Identity test: "== None" would invoke a custom __eq__ on the network.
    if net is None:
        net = network.construct_network(options)

    # Settings block: logged as-is and written to walks.info with a newline
    # appended to every entry.
    detail_lines = [
        '\t random_walker = {}'.format(random_walker),
        '\t walk times = {}'.format(options.walk_times),
        '\t walk length = {}'.format(options.walk_length),
    ]
    if random_walker == "uniform":
        detail_lines.append('\t walk restart = {}'.format(
            options.walk_restart))
    elif random_walker == "bias":
        detail_lines.append('\t return_parameter (p) = {}'.format(options.p))
        detail_lines.append('\t in-out_parameter (q) = {}'.format(options.q))
    detail_lines.append('\t max walk workers = {}'.format(
        options.walk_workers))
    detail_lines.append('\t walk to memory = {}'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        detail_lines.append('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            detail_lines.append('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        detail_lines.append('\t corpus store path = {}'.format(
            options.corpus_store_path))

    logger.info('Corpus bulid: walk info:')
    for line in detail_lines:
        logger.info(line)

    # The context manager guarantees the handle is released even if a write
    # fails.
    walks_info_path = os.path.join(
        os.path.split(options.corpus_store_path)[0], 'walks.info')
    with open(walks_info_path, 'w') as fr_walks:
        fr_walks.write('Corpus walk info:\n')
        fr_walks.writelines(line + '\n' for line in detail_lines)

    walker = Walker(net,
                    random_walker=random_walker,
                    walk_length=options.walk_length,
                    p=options.p,
                    q=options.q)
    if random_walker == "bias":
        # Biased walks need their transition probabilities prepared first.
        walker.preprocess_transition_probs(net_info_path=options.net_info_path)

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        # The corpus is only loaded back when a training task follows.
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    # Drop the global walker and reclaim its memory before returning.
    del walker
    gc.collect()
    return walk_corpus
Пример #16
0
def eval_online(options):
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)

    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    metric_prec_k_list = [1]
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10

    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times, options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

    fr_total = open(options.link_prediction_path, 'w')
    fr_total.write('eval case: link-prediction ...\n')
    fr_total.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr_total.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr_total.write('\t except_data_path: {}\n'.format(
        options.except_data_path))
    fr_total.write('\t data_format: {}\n'.format(options.data_format))
    fr_total.write('\t metrics: MAP and precise@K\n')
    fr_total.write('\t max_index for precise@K: {}\n'.format(
        options.precK_max_index))
    fr_total.write('\t similarity_metric: {}\n'.format(
        options.similarity_metric))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr_total.write('\t sample_nodes_rule: {}\n'.format(
        options.sample_nodes_rule))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr_total.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr_total.write(
        "except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr_total.write(
        "except_net_edges_size = {}\n".format(except_net_edges_size))
    fr_total.write(
        '\t results:\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tMAP\t')
    for v in metric_prec_k_list:
        fr_total.write('\tPr@{}'.format(v))
    fr_total.write("\n")

    last_step = 0
    summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='MAP', simple_value=0.)
    for v in metric_prec_k_list:
        summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_MAP = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue

            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        # loading features_matrix(already trained)
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        time_start = time.time()
        features_matrix = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list)
        os.remove(reading)
        logger.info("\t done for reading ...")
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('total loaded nodes: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('the embedding dimension: {}'.format(
            np.size(features_matrix, axis=1)))

        #
        fr = open(options.link_prediction_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr.write('\t data_format: {}\n'.format(options.data_format))
        fr.write('\t metrics: MAP and precise@K\n')
        fr.write('\t max_index for precise@K: {}\n'.format(
            options.precK_max_index))
        fr.write('\t similarity_metric: {}\n'.format(
            options.similarity_metric))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr.write('\t sample_nodes_rule: {}\n'.format(
            options.sample_nodes_rule))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr.write('total loaded nodes: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('the embedding dimension: {}\n'.format(
            np.size(features_matrix, axis=1)))

        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval,
                              net_except)]  # [[MAP, precisionK_list]]

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))

        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)

        fr.write('\t\t MAP = {}\n'.format(mean_MAP))
        for k in range(options.precK_max_index):
            if k < len(mean_precisionK):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, mean_precisionK[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('details:\n')
        for repeat in range(len(ret_list)):
            fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
            MAP = ret_list[repeat][0]
            precisionK_list = ret_list[repeat][1]
            fr.write('\t\t MAP = {}\n'.format(MAP))
            for k in range(options.precK_max_index):
                if k < len(precisionK_list):
                    fr.write('\t\t precisionK_{} = {}\n'.format(
                        k + 1, precisionK_list[k]))
                else:
                    fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))
        fr.close()

        fr_total.write('%.4f' % mean_MAP)
        summary.value.add(tag='MAP', simple_value=mean_MAP)
        for v in metric_prec_k_list:
            fr_total.write('\t%.4f' % mean_precisionK[v - 1])
            summary.value.add(tag='Pr_{}'.format(v),
                              simple_value=mean_precisionK[v - 1])
        fr_total.write("\n")
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'eval case: ret_list completed in {}s.\n================================='
            .format(time.time() - time_start))

        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if mean_MAP > best_MAP:
            best_MAP = mean_MAP

            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'w')
            else:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_MAP: {}\n".format(best_MAP))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    link_prediction_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)

        last_step = cur_step

    fr_total.close()
    summary_writer.close()
Пример #17
0
def eval_once(options):
    """Run the node-classification evaluation once and write a text report.

    Labeled node ids are read from ``options.label_path`` and their embedding
    vectors from ``options.vectors_path``. The work list is every train ratio
    repeated ``options.repeated_times`` times; it is handed to
    ``_classify_thread_body`` either directly or split across a process pool
    of ``options.eval_workers`` workers. Mean and per-repeat Macro/Micro F1
    are written to ``options.classify_path``.

    The function returns early (doing nothing) when ``utils.check_rebuild``
    reports that the report does not need rebuilding.

    Args:
        options: configuration object; this function reads ``classify_path``,
            ``always_rebuild``, ``eval_online``, ``eval_workers``,
            ``label_path``, ``vectors_path``, ``label_size``,
            ``repeated_times`` and ``train_ratio``.

    Side effects:
        Rebinds the module-level ``features_matrix`` and ``labels_matrix``
        (presumably consumed by ``_classify_thread_body`` -- confirm there).
    """
    global features_matrix, labels_matrix
    if not utils.check_rebuild(options.classify_path,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # `classes` is keyword-only in recent scikit-learn releases; passing it by
    # keyword works on both old and new versions.
    mlb = MultiLabelBinarizer(classes=range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # A positive train_ratio selects a single split; otherwise sweep 0.9..0.1.
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # One entry per (train_ratio, repetition) pair.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    # The context manager guarantees the report is closed even when the
    # evaluation below raises.
    with open(options.classify_path, 'w') as fr:
        fr.write('eval case: classify...\n')
        fr.write('\t save_path: {}\n'.format(options.classify_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(
                i, np.sum(labels_matrix[:, i])))

        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            # speed up by using multi-process
            if len(train_ratio_fulllist) <= options.eval_workers:
                train_ratios_per_worker = [
                    [train_ratio] for train_ratio in train_ratio_fulllist
                ]
            else:
                div = len(train_ratio_fulllist) // options.eval_workers
                train_ratios_per_worker = [
                    train_ratio_fulllist[div * i:div * (i + 1)]
                    for i in range(options.eval_workers)
                ]
                # Hand the leftover items out one by one, starting from the
                # last worker and moving backwards.
                leftovers = train_ratio_fulllist[div * options.eval_workers:]
                for idx, train_ratio in enumerate(leftovers):
                    train_ratios_per_worker[-1 - idx].append(train_ratio)
            logger.info("\t using {} processes for evaling:".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)

        # Group the scores by train ratio: ratio -> [macro_list, micro_list].
        ret_dict = {}
        for ret in ret_list:
            macro_scores, micro_scores = ret_dict.setdefault(
                ret[0], [[], []])
            macro_scores.append(ret[1])
            micro_scores.append(ret[2])

        for train_ratio, macro_micro in sorted(ret_dict.items(),
                                               key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list, Micro_F1_list = macro_micro
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat, (macro, micro) in enumerate(
                    zip(Macro_F1_list, Micro_F1_list), 1):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat, len(Macro_F1_list), macro, micro))
        fr.write('\neval case: classify completed in {}s'.format(time.time() -
                                                                 time_start))
    logger.info('eval case: classify completed in {}s'.format(time.time() -
                                                              time_start))
Пример #18
0
def generate_train_data(corpus_store_path,
                        headflag_of_index_file,
                        train_workers,
                        window_size,
                        idx_vocab_freq_file,
                        sens=None,
                        always_rebuild=False):
    """Build training examples from corpus files or in-memory sentences.

    Args:
        corpus_store_path: path to either an index file (first line equals
            ``headflag_of_index_file``, followed by ``FILE: <path>`` lines)
            or a single corpus file. Only read when ``sens`` is None.
        headflag_of_index_file: header line that marks an index file.
        train_workers: passed as ``max_num_workers`` to
            ``scan_files_using_multiprocess``.
        window_size: passed through to the example extractors.
        idx_vocab_freq_file: cache of ``idx node freq`` lines (space
            separated). Rebuilt via ``scan_vocabs`` when
            ``utils.check_rebuild`` says so, otherwise loaded from disk.
        sens: optional in-memory sentences; when given, no corpus file is
            read.
        always_rebuild: forwarded to ``utils.check_rebuild``.

    Returns:
        ``(data, labels, idx2vocab, nodes_frequencies)`` where ``data`` and
        ``labels`` are the concatenation of the per-source example arrays.
    """
    corpusfiles_list = []
    if sens is None:
        # check index file
        with open(corpus_store_path, 'r') as f:
            headline = f.readline().strip()
            if headline == headflag_of_index_file:
                logger.info('generate training examples from corpus files: ')
                for line in f:
                    line = line.strip()
                    if not line.startswith('FILE:'):
                        continue
                    # The path starts after the 'FILE:' tag plus one
                    # separator character.
                    corpus_path = line[6:]
                    if os.path.exists(corpus_path):
                        logger.info('corpus file: {}'.format(corpus_path))
                        corpusfiles_list.append(corpus_path)
                    else:
                        logger.warning(
                            'cannot find corpus file: {}, skiped.'.format(
                                corpus_path))
            else:
                # Not an index file: treat the path itself as the corpus.
                corpusfiles_list.append(corpus_store_path)
                logger.info(
                    'generate training examples from file: {}...'.format(
                        corpusfiles_list))
    else:
        logger.info('generate training examples from memory sentences...')

    # generate train data
    if utils.check_rebuild(idx_vocab_freq_file,
                           descrip='vocab and frequencies',
                           always_rebuild=always_rebuild):
        logger.info('get vocabs ...')
        time_start = time.time()
        if sens is None:
            vocabs = scan_files_using_multiprocess(
                corpusfiles_list,
                max_num_workers=train_workers,
                func=get_vocabs_from_files)
        else:
            vocabs = [get_vocabs_from_sentences(sens)]
        logger.info('get vocabs completed in {}s'.format(time.time() -
                                                         time_start))
        logger.info('get frequencies ...')
        time_start = time.time()
        vocab2idx, idx2vocab, nodes_frequencies = scan_vocabs(
            vocabs, idx_vocab_freq_file)
        logger.info('get frequencies completed in {}s'.format(time.time() -
                                                              time_start))
    else:
        logger.info("get vocab and frequencies from file: {}".format(
            idx_vocab_freq_file))
        time_start = time.time()
        vocab2idx = {}
        idx2vocab = []
        nodes_frequencies = []
        # Use a context manager so the cache file handle is always released.
        with open(idx_vocab_freq_file) as vocab_file:
            for line in vocab_file:
                linelist = line.strip().split(' ')
                idx = int(linelist[0])
                node = int(linelist[1])
                freq = float(linelist[2])
                vocab2idx[node] = idx
                # idx2vocab is the inverse of vocab2idx, so it must hold the
                # node id at each position (appending idx would only yield an
                # identity list). Lines are assumed to be stored in ascending
                # idx order -- TODO confirm against scan_vocabs.
                idx2vocab.append(node)
                nodes_frequencies.append(freq)
        logger.info(
            'get vocab and frequencies completed in {}s'.format(time.time() -
                                                                time_start))

    logger.info('get training examples ...')
    time_start = time.time()
    if sens is None:
        rets = scan_files_using_multiprocess(corpusfiles_list,
                                             max_num_workers=train_workers,
                                             func=get_examples_from_files,
                                             args=(vocab2idx, window_size))
    else:
        rets = [get_examples_from_sentences(sens, vocab2idx, window_size)]
    logger.info('get training examples completed in {}s'.format(time.time() -
                                                                time_start))
    # Each item of rets is indexed as (data_part, labels_part).
    data = np.concatenate([item[0] for item in rets])
    labels = np.concatenate([item[1] for item in rets])
    return data, labels, idx2vocab, nodes_frequencies
Пример #19
0
def eval_once(options, net):
    """Run the link-prediction evaluation once and write a text report.

    After ``load_features`` and ``load_edges`` have run, the work list is
    every ``(repeat, operator, train_ratio)`` combination. It is handed to
    ``_classify_thread_body`` either directly or split across a process pool
    of ``options.eval_workers`` workers (retried once after a 10 second pause
    if the pool fails). Mean and per-repeat AUC for each
    ``(train_ratio, operator)`` pair are written to
    ``options.link_prediction_path``.

    The function returns early (doing nothing) when ``utils.check_rebuild``
    reports that the report does not need rebuilding.

    Args:
        options: configuration object; this function reads
            ``link_prediction_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``eval_edge_type``,
            ``eval_online``, ``eval_workers``, ``repeated_times``,
            ``feature_operators``, ``sample_size`` and ``train_ratio``.
        net: network object forwarded to ``load_features`` and
            ``load_edges``.

    Raises:
        KeyError: if no result came back for some
            ``(train_ratio, operator)`` pair.
    """
    global features_dict, true_edges_list_by_repeat, neg_edges_list_by_repeat
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: link_prediction...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t eval_edge_type: {}'.format(options.eval_edge_type))
    logger.info('\t save_path: {}\n'.format(options.link_prediction_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeated_times: {}'.format(options.repeated_times))
    logger.info('\t feature_operators: {}'.format(options.feature_operators))
    logger.info('\t sample_size: {}'.format(options.sample_size))

    time_start = time.time()

    load_features(options, net)
    load_edges(options, net)

    logger.info('\t total true edges size: {}'.format(
        len(true_edges_list_by_repeat[0])))
    logger.info('\t total neg edges size: {}'.format(
        len(neg_edges_list_by_repeat[0])))

    repeated_times = options.repeated_times
    # A positive train_ratio selects a single split; otherwise use the
    # default sweep 0.01, 0.05, 0.1 .. 0.9.
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [0.01, 0.05] + [v / 10.0 for v in range(1, 10)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # The context manager guarantees the report is closed even when the
    # evaluation below raises.
    with open(options.link_prediction_path, 'w') as fr:
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t eval_edge_type: {}\n'.format(options.eval_edge_type))
        fr.write('\t save_path: {}\n\n'.format(options.link_prediction_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t feature_operators: {}\n'.format(
            options.feature_operators))
        fr.write('\t repeated_times: {}\n'.format(options.repeated_times))
        fr.write('\t sample_size: {}\n'.format(options.sample_size))
        fr.write('\t total true edges size: {}\n'.format(
            len(true_edges_list_by_repeat[0])))
        fr.write('\t total neg edges size: {}\n'.format(
            len(neg_edges_list_by_repeat[0])))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))

        # One work item per (repeat, operator, train_ratio) combination.
        full_train_ratio_info_list = [
            (repeat, op, train_ratio)
            for repeat in range(repeated_times)
            for op in options.feature_operators
            for train_ratio in train_ratio_list
        ]

        if options.eval_workers > 1 and len(full_train_ratio_info_list) > 1:
            # speed up by using multi-process
            if len(full_train_ratio_info_list) <= options.eval_workers:
                train_ratios_per_worker = [
                    [train_ratio_info]
                    for train_ratio_info in full_train_ratio_info_list
                ]
            else:
                div = len(full_train_ratio_info_list) // options.eval_workers
                train_ratios_per_worker = [
                    full_train_ratio_info_list[div * i:div * (i + 1)]
                    for i in range(options.eval_workers)
                ]
                # Hand the leftover items out one by one to the first workers.
                leftovers = full_train_ratio_info_list[
                    div * options.eval_workers:]
                for idx, train_ratio_info in enumerate(leftovers):
                    train_ratios_per_worker[idx].append(train_ratio_info)
            logger.info("\t using {} processes for evaling:".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))

            def _run_in_pool():
                # Evaluate every chunk in a fresh process pool and flatten
                # the per-chunk results into one list.
                results = []  # (train_ratio, op, auc)
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_classify_thread_body,
                                            train_ratios_per_worker):
                        results.extend(ret)
                return results

            try:
                ret_list = _run_in_pool()
            except Exception:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt / SystemExit still abort the run instead
                # of triggering a retry.
                logger.warning("concurrent.futures.process failed, retry...")
                time.sleep(10)
                ret_list = _run_in_pool()
        else:
            ret_list = _classify_thread_body(full_train_ratio_info_list)

        # Group the AUC values by (train_ratio, operator).
        ret_dict = {}
        for train_ratio, op, auc in ret_list:
            ret_dict.setdefault((train_ratio, op), []).append(auc)

        for train_ratio in train_ratio_list:
            for op in options.feature_operators:
                fr.write('\n' + '-' * 20 + '\n' +
                         'train_ratio = {}, operator = {}\n'.format(
                             train_ratio, op))
                auc_list = ret_dict[(train_ratio, op)]
                if len(auc_list) != repeated_times:
                    logger.warning(
                        "warning: train_ratio={},operator={},, eval unmatched repeated_times: {} != {}"
                        .format(train_ratio, op, len(auc_list),
                                repeated_times))
                mean_auc = sum(auc_list) / float(len(auc_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(repeated_times, len(auc_list)))
                fr.write('\t\t AUC = {}\n'.format(mean_auc))
                fr.write('details:\n')
                for repeat, auc in enumerate(auc_list, 1):
                    fr.write('\t repeated {}/{}: AUC = {}\n'.format(
                        repeat, len(auc_list), auc))
        fr.write('\neval case: link_prediction completed in {}s'.format(
            time.time() - time_start))
    logger.info(
        'eval case: link_prediction completed in {}s'.format(time.time() -
                                                             time_start))