예제 #1
0
def load_features(options, net):
    """Fill the module-level ``features_dict`` with one embedding per node.

    The node ids of the two node types named by ``options.eval_edge_type[0]``
    and ``options.eval_edge_type[1]`` are collected from ``net`` and looked up
    in the vectors stored at ``options.vectors_path``.  The id list returned
    by ``utils.get_vectors`` (not the requested one) is used to key the
    dictionary, each id mapping to the row at the same position in the
    returned matrix.

    Args:
        options: configuration object; reads ``eval_edge_type`` and
            ``vectors_path``.
        net: network object exposing ``get_nodes(node_type=...)``.
            NOTE(review): the two results are joined with ``+``, so they are
            presumably lists -- confirm against the network class.

    Returns:
        None.  The result is published through the global ``features_dict``.
    """
    global features_dict
    # Reset first so a failed load never leaves a previous mapping visible.
    features_dict = {}
    began = time.time()
    logger.info('\t loading embedding features...')

    wanted_ids = (net.get_nodes(node_type=options.eval_edge_type[0]) +
                  net.get_nodes(node_type=options.eval_edge_type[1]))
    keyed_vectors = utils.get_KeyedVectors(options.vectors_path)
    # missing_rule="random" is forwarded untouched; its exact meaning lives in
    # utils.get_vectors (presumably random vectors for unknown ids -- verify).
    found_ids, vectors = utils.get_vectors(keyed_vectors,
                                           wanted_ids,
                                           missing_rule="random")

    features_dict.update(
        (node_id, vectors[row]) for row, node_id in enumerate(found_ids))

    elapsed_message = '\t loading embedding features completed in {}s'.format(
        time.time() - began)
    logger.info(elapsed_message)
예제 #2
0
def eval_once(options):
    """Evaluate link prediction once on saved embeddings and write a report.

    Steps, in order:

    1. Return immediately when ``utils.check_rebuild`` says the report at
       ``options.link_prediction_path`` does not need rebuilding.
    2. Build the evaluation network and the "except" (train) network.
    3. Load the embedding vectors for node ids ``0 .. eval_net_nodes_size-1``.
    4. Evaluate: with ``options.sample_nodes > 0`` the evaluation is repeated
       ``options.repeated_times`` times via ``_sample_thread_body`` (spread
       over ``options.eval_workers`` processes when both values exceed 1);
       otherwise ``_eval`` is run exactly once.
    5. Write the configuration, the mean MAP / precision@K and the per-repeat
       values to ``options.link_prediction_path``.

    Side effects: assigns the module-level globals ``features_matrix``,
    ``net_eval``, ``net_except``, ``SAMPLE_NODES``, ``SAMPLE_RULE``,
    ``METIRC`` and ``PREC_K``.  NOTE(review): these are presumably read by
    ``_sample_thread_body`` / ``_eval``, which are not visible here.

    Args:
        options: configuration object.  Attributes read: link_prediction_path,
            always_rebuild, eval_data_path, except_data_path, data_format,
            isdirected, precK_max_index, similarity_metric, eval_online,
            eval_interval, sample_nodes, sample_nodes_rule, repeated_times,
            eval_workers, vectors_path.

    Returns:
        None.
    """
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return

    # Every line collected here is logged now and written to the report later,
    # so the two outputs cannot drift apart.
    report_lines = [
        'eval case: link-prediction ...',
        '\t save_path: {}'.format(options.link_prediction_path),
        '\t eval_data_path: {}'.format(options.eval_data_path),
        '\t except_data_path: {}'.format(options.except_data_path),
        '\t data_format: {}'.format(options.data_format),
        '\t metrics: MAP and precise@K',
        '\t max_index for precise@K: {}'.format(options.precK_max_index),
        '\t similarity_metric: {}'.format(options.similarity_metric),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_interval: {}s'.format(options.eval_interval),
        '\t sample_nodes: {}'.format(options.sample_nodes),
        '\t sample_nodes_rule: {}'.format(options.sample_nodes_rule),
        '\t repeat {} times'.format(options.repeated_times),
        '\t eval_workers: {}'.format(options.eval_workers),
    ]
    for line in report_lines:
        logger.info(line)

    def _announce(line):
        # Log a line and remember it for the report file.
        logger.info(line)
        report_lines.append(line)

    def _write_scores(out, map_value, precision_at_k):
        # Write one MAP value followed by precision@1..precK_max_index;
        # positions beyond the available precision values are written as None.
        out.write('\t\t MAP = {}\n'.format(map_value))
        for k in range(options.precK_max_index):
            if k < len(precision_at_k):
                out.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precision_at_k[k]))
            else:
                out.write('\t\t precisionK_{} = None\n'.format(k + 1))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    _announce("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    _announce("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    _announce("except_net_nodes_size = {}".format(except_net_nodes_size))
    _announce("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    # The "completed in" figures below are measured from this point on.
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info('\t reading embedding vectors completed in {}s'.format(
        time.time() - time_start))
    _announce('total loaded nodes: {}'.format(
        np.size(features_matrix, axis=0)))
    _announce('the embedding dimension: {}'.format(
        np.size(features_matrix, axis=1)))

    # The context manager closes the report even when the evaluation raises.
    with open(options.link_prediction_path, 'w') as fr:
        fr.writelines(line + '\n' for line in report_lines)

        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                logger.info("\t allocating repeat_times to workers ...")
                if options.repeated_times <= options.eval_workers:
                    # Fewer repeats than workers: one repeat per process.
                    times_per_worker = [1] * options.repeated_times
                else:
                    # Spread evenly; the first `mod` workers take one extra.
                    div, mod = divmod(options.repeated_times,
                                      options.eval_workers)
                    times_per_worker = [
                        div + 1 if idx < mod else div
                        for idx in range(options.eval_workers)
                    ]
                assert sum(
                    times_per_worker
                ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                    sum(times_per_worker), options.repeated_times)

                logger.info("\t using {} processes for evaling:".format(
                    len(times_per_worker)))
                for idx, rep_times in enumerate(times_per_worker):
                    logger.info("\t process-{}: repeat {} times".format(
                        idx, rep_times))

                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))

        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
        _write_scores(fr, mean_MAP, mean_precisionK)

        fr.write('details:\n')
        for number, ret in enumerate(ret_list, start=1):
            fr.write('\t repeated {}/{}:\n'.format(number, len(ret_list)))
            _write_scores(fr, ret[0], ret[1])

        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))

    logger.info('eval case: link_prediction completed in {}s.'.format(
        time.time() - time_start))
예제 #3
0
def eval_online(options):
    """Evaluate link prediction for every new checkpoint while training runs.

    After building the evaluation and "except" (train) networks, the function
    waits until a checkpoint exists under ``<dir of vectors_path>/ckpt`` and
    then polls it.  For every checkpoint step newer than the last evaluated
    one it:

    * loads the vectors at ``options.vectors_path`` (guarded by marker files,
      see below),
    * evaluates them exactly like the one-shot evaluation (sampling / worker
      processes controlled by ``options``),
    * writes a detailed report to ``<link_prediction_path>.<step>``,
    * appends one summary row to ``options.link_prediction_path`` and one
      TensorFlow summary (tags ``MAP`` and ``Pr_<k>``) to the report
      directory,
    * when the mean MAP is a new best, records it in ``best_ckpt.info`` and
      copies the checkpoint's data/index/meta files to ``model.ckpt-best.*``.

    File-based hand-shake: ``<vectors_path>.reading_link_prediction`` is
    created before reading and removed afterwards; reading is postponed while
    ``<vectors_path>.writing`` exists.  NOTE(review): the writer side of this
    protocol is not visible here; the 30 second pause after declaring is
    presumably there to let the writer notice the marker -- confirm.

    The loop ends (the function returns) when no new step is ready and a
    ``RUN_SUCCESS`` file exists next to the vectors file.  The summary file
    and the TensorFlow summary writer are closed on every exit path,
    including that return and any exception.

    Side effects: assigns the module-level globals ``features_matrix``,
    ``net_eval``, ``net_except``, ``SAMPLE_NODES``, ``SAMPLE_RULE``,
    ``METIRC`` and ``PREC_K``.

    Args:
        options: configuration object.  Attributes read: link_prediction_path,
            always_rebuild, eval_data_path, except_data_path, data_format,
            isdirected, precK_max_index, similarity_metric, eval_online,
            eval_interval, sample_nodes, sample_nodes_rule, repeated_times,
            eval_workers, vectors_path.

    Returns:
        None.
    """
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)

    # Lines collected here are logged now and reused as the header of the
    # summary file and of every per-checkpoint report.
    report_lines = [
        'eval case: link-prediction ...',
        '\t save_path: {}'.format(options.link_prediction_path),
        '\t eval_data_path: {}'.format(options.eval_data_path),
        '\t except_data_path: {}'.format(options.except_data_path),
        '\t data_format: {}'.format(options.data_format),
        '\t metrics: MAP and precise@K',
        '\t max_index for precise@K: {}'.format(options.precK_max_index),
        '\t similarity_metric: {}'.format(options.similarity_metric),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_interval: {}s'.format(options.eval_interval),
        '\t sample_nodes: {}'.format(options.sample_nodes),
        '\t sample_nodes_rule: {}'.format(options.sample_nodes_rule),
        '\t repeat {} times'.format(options.repeated_times),
        '\t eval_workers: {}'.format(options.eval_workers),
    ]
    for line in report_lines:
        logger.info(line)

    def _announce(line):
        # Log a line and remember it for the report headers.
        logger.info(line)
        report_lines.append(line)

    def _write_scores(out, map_value, precision_at_k):
        # Write one MAP value followed by precision@1..precK_max_index;
        # positions beyond the available precision values are written as None.
        out.write('\t\t MAP = {}\n'.format(map_value))
        for k in range(options.precK_max_index):
            if k < len(precision_at_k):
                out.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precision_at_k[k]))
            else:
                out.write('\t\t precisionK_{} = None\n'.format(k + 1))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    _announce("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    _announce("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    _announce("except_net_nodes_size = {}".format(except_net_nodes_size))
    _announce("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # K values shown in the summary row / TensorBoard: 1, 10, 20, 50, 100,
    # 200, 500, ... up to (and including) precK_max_index.
    metric_prec_k_list = [1]
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10

    use_pool = (options.sample_nodes > 0 and options.eval_workers > 1
                and options.repeated_times > 1)
    if use_pool:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            # Fewer repeats than workers: one repeat per process.
            times_per_worker = [1] * options.repeated_times
        else:
            # Spread evenly; the first `mod` workers take one extra repeat.
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [
                div + 1 if idx < mod else div
                for idx in range(options.eval_workers)
            ]
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"

    def _latest_step():
        # Step number parsed from the newest checkpoint path ("...-<step>").
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(state.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def _not_ready(step, evaluated_step):
        # True when there is nothing new to read or the vectors are in flux.
        return (step <= evaluated_step
                or (not os.path.exists(options.vectors_path))
                or os.path.exists(writing))

    fr_total = open(options.link_prediction_path, 'w')
    summary_writer = None
    try:
        fr_total.writelines(line + '\n' for line in report_lines)
        fr_total.write(
            '\t results:\n=============================================================\n'
        )
        fr_total.write('finish_time\tckpt\tMAP\t')
        for v in metric_prec_k_list:
            fr_total.write('\tPr@{}'.format(v))
        fr_total.write("\n")

        last_step = 0
        summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
        # Seed every tag with 0 at step 0.
        summary = tf.Summary()
        summary.value.add(tag='MAP', simple_value=0.)
        for v in metric_prec_k_list:
            summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_MAP = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait for a new, stable checkpoint and claim the vectors file.
            while True:
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    if os.path.exists(os.path.join(vectors_dir,
                                                   "RUN_SUCCESS")):
                        # Training finished and nothing new is left.
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                with open(reading, "w"):  # declare; close the handle at once
                    pass
                time.sleep(30)
                # Re-check after the pause: the state may have changed.
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue

                break
            logger.info("\t eval ckpt-{}.......".format(cur_step))
            # loading features_matrix(already trained)
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            time_start = time.time()
            try:
                features_matrix = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list)
            finally:
                # Always release the marker, even when loading fails, so a
                # stale declaration is never left behind.
                os.remove(reading)
            logger.info("\t done for reading ...")
            logger.info('\t reading embedding vectors completed in {}s'.format(
                time.time() - time_start))
            feature_lines = [
                'total loaded nodes: {}'.format(
                    np.size(features_matrix, axis=0)),
                'the embedding dimension: {}'.format(
                    np.size(features_matrix, axis=1)),
            ]
            for line in feature_lines:
                logger.info(line)

            step_report_path = options.link_prediction_path + '.{}'.format(
                cur_step)
            with open(step_report_path, 'w') as fr:
                fr.writelines(line + '\n'
                              for line in report_lines + feature_lines)

                if options.sample_nodes > 0:
                    if use_pool:
                        # speed up by using multi-process
                        ret_list = []  # [[MAP, precisionK_list], ... ]
                        with ProcessPoolExecutor(
                                max_workers=options.eval_workers) as executor:
                            for ret in executor.map(_sample_thread_body,
                                                    times_per_worker):
                                ret_list.extend(ret)
                        if len(ret_list) != options.repeated_times:
                            logger.warning(
                                "warning: eval unmatched repeated_times: {} != {}".
                                format(len(ret_list), options.repeated_times))
                    else:
                        ret_list = _sample_thread_body(options.repeated_times)
                else:
                    # no sampling, no repeat!
                    ret_list = [_eval(net_eval,
                                      net_except)]  # [[MAP, precisionK_list]]

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                                                   cur_step))

                if options.sample_nodes > 0:
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(options.repeated_times, len(ret_list)))
                else:
                    fr.write(
                        'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                        .format(options.sample_nodes, len(ret_list)))

                mean_MAP = np.mean([ret[0] for ret in ret_list])
                mean_precisionK = np.mean([ret[1] for ret in ret_list],
                                          axis=0)
                _write_scores(fr, mean_MAP, mean_precisionK)

                fr.write('details:\n')
                for number, ret in enumerate(ret_list, start=1):
                    fr.write('\t repeated {}/{}:\n'.format(
                        number, len(ret_list)))
                    _write_scores(fr, ret[0], ret[1])

                fr.write(
                    '\neval case: link_prediction completed in {}s.'.format(
                        time.time() - time_start))

            # One row in the summary file and one TensorBoard event per step.
            summary = tf.Summary()
            fr_total.write('%.4f' % mean_MAP)
            summary.value.add(tag='MAP', simple_value=mean_MAP)
            for v in metric_prec_k_list:
                fr_total.write('\t%.4f' % mean_precisionK[v - 1])
                summary.value.add(tag='Pr_{}'.format(v),
                                  simple_value=mean_precisionK[v - 1])
            fr_total.write("\n")
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'eval case: ret_list completed in {}s.\n================================='
                .format(time.time() - time_start))

            # Keep a copy of the checkpoint files whenever the mean MAP
            # improves on the best value seen so far.
            if mean_MAP > best_MAP:
                best_MAP = mean_MAP

                ckptIsExists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(link_prediction_dir,
                                              'best_ckpt.info')
                # Overwrite when the checkpoint can be copied; otherwise
                # append, because the previously copied model stays in place.
                with open(best_info_path,
                          'w' if ckptIsExists else 'a') as fr_best:
                    if not ckptIsExists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_MAP: {}\n".format(best_MAP))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckptIsExists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        sourceFile = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        targetFile = os.path.join(
                            link_prediction_dir,
                            'model.ckpt-best.%s' % suffix)
                        if os.path.exists(targetFile):
                            os.remove(targetFile)
                        shutil.copy(sourceFile, targetFile)

            last_step = cur_step
    finally:
        # Runs on the RUN_SUCCESS return as well as on errors.
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
예제 #4
0
def eval_once(options):
    """Run the clustering evaluation once and write the NMI report.

    Steps, in order:

    1. Return immediately when ``utils.check_rebuild`` says the report at
       ``options.cluster_path`` does not need rebuilding.
    2. Read the labeled ids from ``options.label_path`` and the matching
       vectors from ``options.vectors_path``; only the first label of every
       item is kept.
    3. Call ``_cluster_thread_body`` for ``options.repeated_times`` repeats,
       spread over ``options.eval_workers`` processes when both values exceed
       1.  A failed run is retried exactly once; a second failure propagates.
    4. Write the configuration, per-label counts, the mean of the returned
       values (reported as NMI) and every individual value to
       ``options.cluster_path``.

    Side effects: assigns the module-level globals ``features_matrix``,
    ``labels_matrix`` and ``LABEL_SIZE``.  NOTE(review): presumably read by
    ``_cluster_thread_body``, which is not visible here.

    Args:
        options: configuration object.  Attributes read: cluster_path,
            always_rebuild, multilabel_rule, eval_online, eval_workers,
            repeated_times, label_path, vectors_path, label_size.

    Returns:
        None.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return

    # Lines collected here are logged now and written to the report later.
    report_lines = [
        'eval case: cluster...',
        '\t save_path: {}'.format(options.cluster_path),
        '\t cluster: kmeans',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_workers: {}'.format(options.eval_workers),
        '\t repeat {} times'.format(options.repeated_times),
    ]
    for line in report_lines:
        logger.info(line)

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    size_lines = [
        '\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)),
        '\t total labels size: {}'.format(options.label_size),
    ]
    for line in size_lines:
        logger.info(line)
    report_lines.extend(size_lines)

    use_pool = options.eval_workers > 1 and options.repeated_times > 1

    def _run_repeats(times_per_worker):
        # Execute all repeats once and return the list of NMI values.
        if not use_pool:
            return _cluster_thread_body(options.repeated_times)
        collected = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                collected.extend(ret)
        return collected

    # cluster
    # The context manager closes the report even when the evaluation raises.
    with open(options.cluster_path, 'w') as fr:
        fr.writelines(line + '\n' for line in report_lines)
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))

        times_per_worker = None
        if use_pool:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                # Fewer repeats than workers: one repeat per process.
                times_per_worker = [1] * options.repeated_times
            else:
                # Spread evenly; the first `mod` workers take one extra.
                div, mod = divmod(options.repeated_times, options.eval_workers)
                times_per_worker = [
                    div + 1 if idx < mod else div
                    for idx in range(options.eval_workers)
                ]
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

        try:
            nmi_list = _run_repeats(times_per_worker)
        except Exception as err:
            # Best-effort single retry.  Only ordinary errors are retried so
            # that KeyboardInterrupt / SystemExit still stop the program.
            logger.warning(
                "warning: cluster eval failed ({}), retrying once ...".format(
                    err))
            nmi_list = _run_repeats(times_per_worker)

        if use_pool and len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))

        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for number, nmi in enumerate(nmi_list, start=1):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                number, len(nmi_list), nmi))
        fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                                 time_start))

    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))
예제 #5
0
def eval_online(options):
    """Evaluate clustering quality (NMI) for each new checkpoint during training.

    Polls the ``ckpt`` directory located next to ``options.vectors_path``.
    Whenever a checkpoint with a larger step than the last evaluated one is
    available and the vectors file is not being written, the vectors are
    loaded, ``_cluster_thread_body`` is executed ``options.repeated_times``
    times (optionally spread over ``options.eval_workers`` processes) and the
    mean NMI is written to:

    * ``options.cluster_path`` (one summary line per checkpoint),
    * ``<options.cluster_path>.<step>`` (detailed per-checkpoint report),
    * a TensorFlow summary in the directory of ``options.cluster_path``.

    When a checkpoint improves on the best mean NMI seen so far, its
    ``model.ckpt-<step>.*`` files are copied to ``model.ckpt-best.*`` in the
    result directory and ``best_ckpt.info`` is updated.

    The function returns once no newer checkpoint is ready and a
    ``RUN_SUCCESS`` file exists next to the vectors file. It also returns
    right after the setup when ``options.eval_online`` is false, or
    immediately when ``utils.check_rebuild`` declines the rebuild.

    Args:
        options: configuration object; the attributes read here are
            ``cluster_path``, ``always_rebuild``, ``multilabel_rule``,
            ``eval_online``, ``eval_interval``, ``eval_workers``,
            ``repeated_times``, ``label_size``, ``label_path`` and
            ``vectors_path``.

    Returns:
        None.
    """
    # Published as module globals; presumably read by _cluster_thread_body
    # (including in worker processes) -- confirm against its definition.
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    if not utils.check_rebuild(cluster_dir,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_multiprocess = (options.eval_workers > 1
                        and options.repeated_times > 1)
    times_per_worker = []
    if use_multiprocess:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1] * options.repeated_times
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            # the first `mod` workers take one extra repetition
            times_per_worker = [
                div + 1 if idx < mod else div
                for idx in range(options.eval_workers)
            ]
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    run_success = os.path.join(vectors_dir, "RUN_SUCCESS")
    # Marker files used to coordinate with whoever rewrites the vectors file.
    # NOTE(review): presumably the trainer creates "<vectors>.writing" and
    # honours "<vectors>.reading_cluster" -- confirm on the writer side.
    reading = options.vectors_path + ".reading_cluster"
    writing = options.vectors_path + ".writing"

    def _run_repeats():
        """Run all repetitions (in-process or in a pool) and return NMIs."""
        if not use_multiprocess:
            return _cluster_thread_body(options.repeated_times)
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    def _latest_step():
        """Return the step number encoded in the newest checkpoint path."""
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(state.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def _not_ready(step, evaluated_step):
        """True when there is nothing new to read or a write is in progress."""
        return (step <= evaluated_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    fr_total = open(options.cluster_path, 'w')
    summary_writer = None
    try:
        fr_total.write('eval case: cluster...\n')
        fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
        fr_total.write('\t cluster: kmeans\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(
            options.multilabel_rule))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(NMI):\n=============================================================\n'
        )
        fr_total.write('finish_time\tckpt\tNMI\n')

        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        time_start = time.time()
        # Keep the labels exactly as loaded: they are handed to get_vectors
        # together with id_list for every checkpoint, so they must stay
        # aligned with id_list and must not be replaced by a returned list.
        id_list, labels_list_total = utils.get_labeled_data(
            options.label_path, multilabel_rule=options.multilabel_rule)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='nmi', simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_nmi = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait until a new checkpoint can be read without racing a write.
            while True:
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    if os.path.exists(run_success):
                        # Training finished and nothing new is left to score;
                        # the finally-clause closes the open writers.
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                open(reading, "w").close()  # declare
                # Grace period, then re-check that no write started meanwhile.
                time.sleep(30)
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list,
                    labels_list_total)
            finally:
                # Always release the marker, even if loading failed, so the
                # writer side is never blocked by a stale declaration.
                os.remove(reading)  # synchrolock for multi-process
            logger.info("\t done for reading ...")
            labels_matrix = np.array([item[0] for item in labels_list])
            LABEL_SIZE = options.label_size
            logger.info('\t reading labeled data completed in {}s'.format(
                time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(
                options.label_size))

            # cluster
            report_path = options.cluster_path + '.{}'.format(cur_step)
            with open(report_path, 'w') as fr:
                fr.write('eval case: cluster...\n')
                fr.write('\t cluster: kmeans\n')
                fr.write('\t multilabel_rule: {}\n'.format(
                    options.multilabel_rule))
                fr.write('\t eval_workers: {}\n'.format(
                    options.eval_workers))
                fr.write('\t repeat {} times\n'.format(
                    options.repeated_times))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix == i)))

                if use_multiprocess:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(times_per_worker)))
                    for idx, rep_times in enumerate(times_per_worker):
                        fr.write("\t process-{}: repeat {} times\n".format(
                            idx, rep_times))

                # One retry on failure, as before, but without swallowing
                # KeyboardInterrupt/SystemExit and with the cause logged.
                try:
                    nmi_list = _run_repeats()
                except Exception as exc:
                    logger.warning(
                        "warning: cluster eval failed ({}), retrying once".
                        format(exc))
                    nmi_list = _run_repeats()

                if use_multiprocess and len(
                        nmi_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(nmi_list), options.repeated_times))

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(
                        time.time())), cur_step))
                summary = tf.Summary()

                mean_nmi = sum(nmi_list) / float(len(nmi_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(options.repeated_times, len(nmi_list)))
                fr.write('\t\t NMI = {}\n'.format(mean_nmi))
                fr.write('details:\n')
                for repeat, nmi in enumerate(nmi_list, 1):
                    fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                        repeat, len(nmi_list), nmi))
                fr.write('\neval case: cluster completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('{}\n'.format(mean_nmi))
            fr_total.flush()
            summary.value.add(tag='nmi', simple_value=mean_nmi)
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'cluster completed in {}s\n================================='.
                format(time.time() - time_start))

            # Keep a copy of the checkpoint with the best mean NMI so far.
            if mean_nmi > best_nmi:
                best_nmi = mean_nmi

                ckpt_exists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(cluster_dir, 'best_ckpt.info')
                # If the checkpoint files are already gone, append a note
                # instead of overwriting the info of the last copied one.
                with open(best_info_path,
                          'w' if ckpt_exists else 'a') as fr_best:
                    if not ckpt_exists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_nmi: {}\n".format(best_nmi))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckpt_exists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        source_file = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        target_file = os.path.join(
                            cluster_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(target_file):
                            os.remove(target_file)
                        shutil.copy(source_file, target_file)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()

    return
예제 #6
0
def eval_once(options):
    """Run the node-classification evaluation once and write a report file.

    Loads the labeled ids from ``options.label_path`` and their vectors from
    ``options.vectors_path``, binarizes the labels over
    ``range(options.label_size)`` and calls ``_classify_thread_body`` for
    ``options.repeated_times`` repetitions of every train ratio: only
    ``options.train_ratio`` when it is positive, otherwise 0.9 down to 0.1.
    With more than one worker and more than one (ratio, repetition) job, the
    jobs are spread over a ``ProcessPoolExecutor``.

    The mean and per-repetition Macro/Micro F1 of each train ratio are
    written to ``options.classify_path``.

    Args:
        options: configuration object; the attributes read here are
            ``classify_path``, ``always_rebuild``, ``eval_online``,
            ``eval_workers``, ``label_path``, ``vectors_path``,
            ``label_size``, ``repeated_times`` and ``train_ratio``.

    Returns:
        None. Returns early when ``utils.check_rebuild`` declines the rebuild.
    """
    # Published as module globals; presumably read by _classify_thread_body
    # (including in worker processes) -- confirm against its definition.
    global features_matrix, labels_matrix
    if not utils.check_rebuild(options.classify_path,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # `classes` is keyword-only in current scikit-learn releases.
    mlb = MultiLabelBinarizer(classes=range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # One entry per (train_ratio, repetition) job.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    # classify
    with open(options.classify_path, 'w') as fr:
        fr.write('eval case: classify...\n')
        fr.write('\t save_path: {}\n'.format(options.classify_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(
                i, np.sum(labels_matrix[:, i])))

        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            # speed up by using multi-process
            if len(train_ratio_fulllist) <= options.eval_workers:
                train_ratios_per_worker = [
                    [train_ratio] for train_ratio in train_ratio_fulllist
                ]
            else:
                div, mod = divmod(len(train_ratio_fulllist),
                                  options.eval_workers)
                train_ratios_per_worker = [
                    train_ratio_fulllist[div * i:div * (i + 1)]
                    for i in range(options.eval_workers)
                ]
                # Hand the leftover jobs out one by one, starting from the
                # last worker and moving backwards.
                leftovers = train_ratio_fulllist[div * options.eval_workers:]
                for idx, train_ratio in enumerate(leftovers):
                    train_ratios_per_worker[-1 - idx].append(train_ratio)
            logger.info("\t using {} processes for evaling:".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)

        # Group scores by train ratio: ratio -> (macro scores, micro scores).
        ret_dict = {}
        for ret in ret_list:
            macro_scores, micro_scores = ret_dict.setdefault(
                ret[0], ([], []))
            macro_scores.append(ret[1])
            micro_scores.append(ret[2])

        for train_ratio, macro_micro in sorted(ret_dict.items(),
                                               key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list = macro_micro[0]
            Micro_F1_list = macro_micro[1]
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat, (macro, micro) in enumerate(
                    zip(Macro_F1_list, Micro_F1_list), 1):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat, len(Macro_F1_list), macro, micro))
        fr.write('\neval case: classify completed in {}s'.format(time.time() -
                                                                 time_start))
    logger.info('eval case: classify completed in {}s'.format(time.time() -
                                                              time_start))
예제 #7
0
def eval_online(options):
    """Evaluate node classification for each new checkpoint during training.

    Polls the ``ckpt`` directory located next to ``options.vectors_path``.
    Whenever a checkpoint with a larger step than the last evaluated one is
    available and the vectors file is not being written, the vectors are
    loaded and ``_classify_thread_body`` is run for
    ``options.repeated_times`` repetitions of every train ratio (only
    ``options.train_ratio`` when it is positive, otherwise 0.9 down to 0.1),
    optionally spread over ``options.eval_workers`` processes. Mean
    Macro/Micro F1 values are written to:

    * ``options.classify_path`` (one summary line per checkpoint),
    * ``<options.classify_path>.<step>`` (detailed per-checkpoint report),
    * a TensorFlow summary in the directory of ``options.classify_path``.

    When the mean Micro F1 of the largest evaluated train ratio improves on
    the best value seen so far, the ``model.ckpt-<step>.*`` files are copied
    to ``model.ckpt-best.*`` in the result directory and ``best_ckpt.info``
    is updated.

    The function returns once no newer checkpoint is ready and a
    ``RUN_SUCCESS`` file exists next to the vectors file. It also returns
    right after the setup when ``options.eval_online`` is false, or
    immediately when ``utils.check_rebuild`` declines the rebuild.

    Args:
        options: configuration object; the attributes read here are
            ``classify_path``, ``always_rebuild``, ``eval_online``,
            ``eval_interval``, ``eval_workers``, ``label_size``,
            ``repeated_times``, ``train_ratio``, ``label_path`` and
            ``vectors_path``.

    Returns:
        None.
    """
    # Published as module globals; presumably read by _classify_thread_body
    # (including in worker processes) -- confirm against its definition.
    global features_matrix, labels_matrix
    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)
    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    # One entry per (train_ratio, repetition) job.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    use_multiprocess = (options.eval_workers > 1
                        and len(train_ratio_fulllist) > 1)
    train_ratios_per_worker = []
    if use_multiprocess:
        # speed up by using multi-process
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            # Hand the leftover jobs out one by one, starting from the last
            # worker and moving backwards.
            leftovers = train_ratio_fulllist[div * options.eval_workers:]
            for idx, train_ratio in enumerate(leftovers):
                train_ratios_per_worker[-1 - idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    run_success = os.path.join(vectors_dir, "RUN_SUCCESS")
    # Marker files used to coordinate with whoever rewrites the vectors file.
    # NOTE(review): presumably the trainer creates "<vectors>.writing" and
    # honours "<vectors>.reading_classify" -- confirm on the writer side.
    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"

    def _run_classify():
        """Run all jobs (in-process or in a pool); return result tuples."""
        if not use_multiprocess:
            return _classify_thread_body(train_ratio_fulllist)
        results = []  # (train_ratio, macro, micro)
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_classify_thread_body,
                                    train_ratios_per_worker):
                results.extend(ret)
        return results

    def _latest_step():
        """Return the step number encoded in the newest checkpoint path."""
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(state.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def _not_ready(step, evaluated_step):
        """True when there is nothing new to read or a write is in progress."""
        return (step <= evaluated_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    fr_total = open(options.classify_path, 'w')
    summary_writer = None
    try:
        fr_total.write('eval case: classify...\n')
        fr_total.write('\t save_dir: {}\n'.format(classify_dir))
        fr_total.write('\t classifier: LogisticRegression\n')
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write(
            '\t repeat {} times for each train_ratio in {}\n'.format(
                repeated_times, train_ratio_list))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
        )
        # Column titles follow the ratios actually evaluated, in the same
        # ascending order as the result rows (0.1 ... 0.9 by default).
        fr_total.write('finish_time\tckpt\t\t' + '\t'.join(
            '{}'.format(ratio) for ratio in sorted(train_ratio_list)) + '\n')

        time_start = time.time()
        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        id_list_totoal, labels_list_total = utils.get_labeled_data(
            options.label_path)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
        summary = tf.Summary()
        for train_ratio in train_ratio_list:
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=0.)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_micro = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait until a new checkpoint can be read without racing a write.
            while True:
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    if os.path.exists(run_success):
                        # Training finished and nothing new is left to score;
                        # the finally-clause closes the open writers.
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                open(reading, "w").close()  # declare
                # Grace period, then re-check that no write started meanwhile.
                time.sleep(30)
                cur_step = _latest_step()
                if _not_ready(cur_step, last_step):
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path),
                    id_list_totoal, labels_list_total)
            finally:
                # Always release the marker, even if loading failed, so the
                # writer side is never blocked by a stale declaration.
                os.remove(reading)  # synchrolock for multi-process
            logger.info("\t done for reading ...")
            # `classes` is keyword-only in current scikit-learn releases.
            mlb = MultiLabelBinarizer(classes=range(options.label_size))
            labels_matrix = mlb.fit_transform(labels_list)
            logger.info(
                '\t reading embedding vectors completed in {}s'.format(
                    time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(
                options.label_size))

            # Micro F1 of the largest train ratio evaluated for this
            # checkpoint; stays None when no results came back.
            top_ratio_micro = None

            # classify
            report_path = options.classify_path + '.{}'.format(cur_step)
            with open(report_path, 'w') as fr:
                fr.write('eval case: classify...\n')
                fr.write('\t classifier: LogisticRegression\n')
                fr.write('\t eval_workers: {}\n'.format(
                    options.eval_workers))
                fr.write(
                    '\t repeat {} times for each train_ratio in {}\n'.format(
                        repeated_times, train_ratio_list))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix[:, i])))

                if use_multiprocess:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(train_ratios_per_worker)))
                    for idx, train_ratios in enumerate(
                            train_ratios_per_worker):
                        fr.write("\t process-{}: {}\n".format(
                            idx, train_ratios))
                ret_list = _run_classify()

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(
                        time.time())), cur_step))
                summary = tf.Summary()

                # Group scores by train ratio: ratio -> (macro, micro) lists.
                ret_dict = {}
                for ret in ret_list:
                    macro_scores, micro_scores = ret_dict.setdefault(
                        ret[0], ([], []))
                    macro_scores.append(ret[1])
                    micro_scores.append(ret[2])

                for train_ratio, macro_micro in sorted(
                        ret_dict.items(), key=lambda item: item[0]):
                    fr.write('\n' + '-' * 20 + '\n' +
                             'train_ratio = {}\n'.format(train_ratio))
                    Macro_F1_list = macro_micro[0]
                    Micro_F1_list = macro_micro[1]
                    if len(Macro_F1_list) != repeated_times:
                        logger.warning(
                            "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                            .format(train_ratio, len(Macro_F1_list),
                                    repeated_times))
                    mean_Macro_F1 = sum(Macro_F1_list) / float(
                        len(Macro_F1_list))
                    mean_Micro_F1 = sum(Micro_F1_list) / float(
                        len(Micro_F1_list))
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(repeated_times, len(Macro_F1_list)))
                    fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
                    fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
                    fr.write('details:\n')
                    for repeat, (macro, micro) in enumerate(
                            zip(Macro_F1_list, Micro_F1_list), 1):
                        fr.write(
                            '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'
                            .format(repeat, len(Macro_F1_list), macro,
                                    micro))
                    fr_total.write('%.4f, %.4f    ' %
                                   (mean_Macro_F1, mean_Micro_F1))
                    summary.value.add(
                        tag='macro_train_{}'.format(train_ratio),
                        simple_value=mean_Macro_F1)
                    summary.value.add(
                        tag='micro_train_{}'.format(train_ratio),
                        simple_value=mean_Micro_F1)
                    # Ratios are visited in ascending order, so the last
                    # assignment belongs to the largest train ratio.
                    top_ratio_micro = mean_Micro_F1

                fr.write('\neval case: classify completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('\n')
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'classify completed in {}s\n================================='.
                format(time.time() - time_start))

            # Keep a copy of the checkpoint with the best mean Micro F1 at
            # the largest train ratio (0.9 with the default ratio list).
            if top_ratio_micro is not None and top_ratio_micro > best_micro:
                best_micro = top_ratio_micro

                ckpt_exists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                best_info_path = os.path.join(classify_dir, 'best_ckpt.info')
                # If the checkpoint files are already gone, append a note
                # instead of overwriting the info of the last copied one.
                with open(best_info_path,
                          'w' if ckpt_exists else 'a') as fr_best:
                    if not ckpt_exists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write(
                        "best_micro(for ratio 0.9): {}\n".format(best_micro))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckpt_exists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        source_file = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        target_file = os.path.join(
                            classify_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(target_file):
                            os.remove(target_file)
                        shutil.copy(source_file, target_file)

            last_step = cur_step
    finally:
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
예제 #8
0
def eval_once(options):
    """Evaluate the embedding once via t-SNE visualization and write a report.

    Reads the labeled nodes from ``options.label_path``, looks up their
    embedding vectors in ``options.vectors_path``, plots them with
    ``plot_embedding_in_2D`` and writes the settings, the per-label counts and
    the value returned by the plot helper (reported as
    ``clustering_center_distance_sim``) to ``options.visualization_path``.
    The same information is also sent to the module logger.

    The function returns immediately, without doing anything, when
    ``utils.check_rebuild`` returns a falsy value for the report path.

    Args:
        options: configuration object.  The attributes read here are
            ``visualization_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online`` and ``vectors_path``.
    """
    if not utils.check_rebuild(options.visualization_path, descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return

    # Echo the run configuration to the log.
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_path: {}\n'.format(options.visualization_path))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))

    # Load the labeled node ids and the matching embedding vectors.
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    nodes_path = os.path.join(options.data_dir, options.data_name + ".nodes")
    id_list, labels_list = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                  multilabel_rule=options.multilabel_rule,
                                                  type_filepath=nodes_path)
    id_list, features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of every entry is used for plotting.
    labels_matrix = np.array([item[0] for item in labels_list])

    # Computed once and shared by the log output and the report file.
    data_size = np.size(features_matrix, axis=0)
    embedding_dim = np.size(features_matrix, axis=1)
    label_counts = [np.sum(labels_matrix == i) for i in range(options.label_size)]

    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(data_size))
    logger.info('\t the labels data embedding_dimension: {}'.format(embedding_dim))
    logger.info('\t total labels size: {}'.format(options.label_size))
    for i, count in enumerate(label_counts):
        logger.info('\t\t label {}: {}'.format(i, count))

    # The context manager guarantees the report is closed even if plotting fails.
    with open(options.visualization_path, 'w') as fr:
        fr.write('eval case: visualization...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t label_path = {}\n'.format(options.label_path))
        fr.write('\t label_size = {}\n'.format(options.label_size))
        fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr.write('\t save_path: {}\n\n'.format(options.visualization_path))
        fr.write('\t method: t-SNE\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t marker_size: {}\n'.format(options.marker_size))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t total labeled data size: {}\n'.format(data_size))
        fr.write('\t the labels data embedding_dimension: {}\n'.format(embedding_dim))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i, count in enumerate(label_counts):
            fr.write('\t\t label {}: {}\n'.format(i, count))

        # The figure is stored next to the report, named after the embedding size.
        figure_name = "visualization_" + str(embedding_dim)
        figure_path = os.path.join(os.path.split(options.visualization_path)[0], figure_name)
        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)

        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
        fr.write('\neval case: visualization completed in {}s\n ======================'.format(
            time.time() - time_start))

    logger.info('eval case: visualization completed in {}s\n ======================'.format(
        time.time() - time_start))
예제 #9
0
def eval_online(options):
    """Repeatedly evaluate new checkpoints via t-SNE visualization.

    The function polls ``<dir of options.vectors_path>/ckpt`` for the latest
    checkpoint.  Whenever a newer step is available (and the vectors file
    exists and no ``<vectors_path>.writing`` marker is present) it:

    * declares a ``<vectors_path>.reading_visualization_<eval_node_type>``
      marker file, re-checks after 30 seconds, loads the vectors and removes
      the marker again (the marker files appear to be a handshake with the
      process that writes the vectors -- TODO confirm against the writer);
    * plots the embedding with ``plot_embedding_in_2D`` and writes a per-step
      report to ``<options.visualization_path>.<step>``;
    * appends one result line to ``options.visualization_path`` and one
      ``CCD`` scalar to a TensorFlow summary in the report directory;
    * when the CCD value beats the best seen so far, records it in
      ``best_ckpt.info`` and copies the checkpoint files to
      ``model.ckpt-best.*`` in the report directory.

    The loop ends when ``options.eval_online`` is falsy, or when nothing new
    is available and a ``RUN_SUCCESS`` file exists next to the vectors file.
    The total report and the summary writer are closed on every exit path.

    Args:
        options: configuration object.  The attributes read here are
            ``visualization_path``, ``always_rebuild``, ``data_dir``,
            ``data_name``, ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online``, ``eval_interval`` and ``vectors_path``.
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir, descrip='visualization', always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)

    # Echo the run configuration to the log.
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_dir: {}\n'.format(visual_dir))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))

    # The labeled ids do not change between checkpoints, so load them once.
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    nodes_path = os.path.join(options.data_dir, options.data_name + ".nodes")
    id_list_total, labels_list_total = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                              multilabel_rule=options.multilabel_rule,
                                                              type_filepath=nodes_path)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

    logger.info('\t total labeled data size: {}'.format(len(id_list_total)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    # Marker files used to coordinate access to the vectors file.
    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"

    def _poll_checkpoint(previous_step):
        """Return (latest step, whether a newer, readable result is available)."""
        state = tf.train.get_checkpoint_state(ckpt_dir)
        step = int(state.model_checkpoint_path.split('/')[-1].split('-')[-1])
        ready = (step > previous_step
                 and os.path.exists(options.vectors_path)
                 and not os.path.exists(writing))
        return step, ready

    fr_total = open(options.visualization_path, 'w')
    summary_writer = None
    try:
        fr_total.write('eval case: visualization...\n')
        fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
        fr_total.write('\t data_name = {}\n'.format(options.data_name))
        fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
        fr_total.write('\t label_path = {}\n'.format(options.label_path))
        fr_total.write('\t label_size = {}\n'.format(options.label_size))
        fr_total.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr_total.write('\t save_dir: {}\n\n'.format(visual_dir))
        fr_total.write('\t method: t-SNE\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr_total.write('\t marker_size: {}\n'.format(options.marker_size))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr_total.write('\t total labeled data size: {}\n'.format(len(id_list_total)))
        fr_total.write('\t total labels size: {}\n'.format(options.label_size))
        fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                       '=============================================================\n')
        fr_total.write('finish_time\tckpt\tCCD\n')

        # Seed the summary with a zero value at step 0.
        last_step = 0
        summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='CCD', simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_CCD = 0

        # Block until the first checkpoint shows up.
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("model and vectors not exist, waiting...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            # Wait for a newer checkpoint and claim the vectors file for reading.
            while True:
                cur_step, ready = _poll_checkpoint(last_step)
                if not ready:
                    if os.path.exists(os.path.join(vectors_dir, "RUN_SUCCESS")):
                        # Nothing new and the run is marked finished; the
                        # ``finally`` below closes the report and the writer.
                        return
                    time.sleep(options.eval_interval)
                    continue
                # ready for reading
                logger.info("\t declare for reading ...")
                open(reading, "w").close()  # declare (close the handle right away)
                time.sleep(30)
                # Re-check after the grace period in case the state changed meanwhile.
                cur_step, ready = _poll_checkpoint(last_step)
                if not ready:
                    os.remove(reading)  # undeclare
                    logger.info("\t confliction! undeclare and waiting ...")
                    time.sleep(options.eval_interval)
                    continue
                break

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
            try:
                id_list, features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list_total, labels_list_total)
            finally:
                # synchrolock for multi-process: always release the marker,
                # even when loading the vectors fails.
                os.remove(reading)
            logger.info("\t done for reading ...")
            # Only the first label of every entry is used for plotting.
            labels_matrix = np.array([item[0] for item in labels_list])
            data_size = np.size(features_matrix, axis=0)
            label_counts = [np.sum(labels_matrix == i) for i in range(options.label_size)]
            logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(data_size))
            logger.info('\t total labels size: {}'.format(options.label_size))
            for i, count in enumerate(label_counts):
                logger.info('\t\t label {}: {}'.format(i, count))

            # visualization: one report file per evaluated checkpoint
            with open(options.visualization_path + '.{}'.format(cur_step), 'w') as fr:
                fr.write('eval case: visualization...\n')
                fr.write('\t data_dir = {}\n'.format(options.data_dir))
                fr.write('\t data_name = {}\n'.format(options.data_name))
                fr.write('\t isdirected = {}\n'.format(options.isdirected))
                fr.write('\t label_path = {}\n'.format(options.label_path))
                fr.write('\t label_size = {}\n'.format(options.label_size))
                fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
                fr.write('\t method: t-SNE\n')
                fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
                fr.write('\t marker_size: {}\n'.format(options.marker_size))
                fr.write('\t eval_online: {}\n'.format(options.eval_online))
                fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
                fr.write('\t total labeled data size: {}\n'.format(data_size))
                fr.write('\t total labels size: {}\n'.format(options.label_size))
                for i, count in enumerate(label_counts):
                    fr.write('\t\t label {}: {}\n'.format(i, count))

                # The timestamp is written before plotting; the value follows after.
                fr_total.write('%s ckpt-%-9d: ' % (
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))

                figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) + '.{}'.format(cur_step)
                figure_path = os.path.join(visual_dir, figure_name)
                CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                           features_matrix=features_matrix,
                                           labels_matrix=labels_matrix,
                                           label_size=options.label_size,
                                           figure_path=figure_path)

                fr.write('\n figure_path: {}\n'.format(figure_path))
                fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
                fr.write('\neval case: visualization completed in {}s\n ======================'.format(
                    time.time() - time_start))

            fr_total.write('%.4f\n' % CCD)
            fr_total.flush()
            summary = tf.Summary()
            summary.value.add(tag='CCD', simple_value=CCD)
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info('visualization completed in {}s\n================================='.format(
                time.time() - time_start))

            # Keep a copy of the checkpoint files with the best CCD seen so far.
            if CCD > best_CCD:
                best_CCD = CCD

                ckpt_exists = os.path.exists(os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                # Overwrite the info file when the checkpoint can be copied;
                # otherwise append a note that model.ckpt-best is stale.
                with open(os.path.join(visual_dir, 'best_ckpt.info'), 'w' if ckpt_exists else 'a') as fr_best:
                    if not ckpt_exists:
                        fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                                      "the current best_ckpt model is loss, but the result is:\n")
                    fr_best.write("best_CCD: {}\n".format(best_CCD))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                if ckpt_exists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        source_file = os.path.join(ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
                        target_file = os.path.join(visual_dir, 'model.ckpt-best.%s' % suffix)
                        if os.path.exists(target_file):
                            os.remove(target_file)
                        shutil.copy(source_file, target_file)
            last_step = cur_step
    finally:
        # Runs on normal exit, on the RUN_SUCCESS early return and on errors.
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()