Example #1
def run_infer(config, loaded_infer_model, infer_sess, pred_file):
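    # Load the inference inputs, run the model over them batch by batch, and
    # write each original input line with its predicted label appended to pred_file.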
    logger.info("  inference to output %s." % pred_file)

    infer_data = data_helper.load_data(config.infer_file, config.word_vocab_file, config.char_vocab_file,
                                       w_max_len1=config.max_word_len1,
                                       w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1,
                                       c_max_len2=config.max_char_len2,
                                       text_split="|", split="\t",
                                       mode="infer")
    infer_iterator = data_helper.batch_iterator(infer_data, batch_size=config.infer_batch_size, shuffle=False, mode="infer")

    pred_labels = []
    lines = open(config.infer_file, "r", encoding="utf-8").readlines()

    with open(pred_file, mode="w", encoding="utf-8") as pred_f:
        pred_f.write("")

        while True:
            try:
                batch = next(infer_iterator)
                b_word_ids1, b_word_ids2, b_word_len1, b_word_len2, b_char_ids1, b_char_ids2, b_char_len1, b_char_len2 = batch
                pred = loaded_infer_model.infer(infer_sess, b_word_ids1, b_word_ids2, b_word_len1, b_word_len2, b_char_ids1, b_char_ids2, b_char_len1, b_char_len2)
                pred_labels.extend(pred)
            except StopIteration:
                logger.info("  Done inference.")
                break

        for line, p in zip(lines, pred_labels):
            res = line.strip() + "\t" + str(p) + "\n"
            pred_f.write(res)
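
# A minimal sketch (an assumption, mirroring the model_helper/session pattern used
# in the other examples on this page) of how run_infer might be wired up; the
# output file name is a placeholder:
infer_model = model_helper.create_model(model_creator, config, "infer")
infer_sess = tf.Session(config=utils.get_config_proto(), graph=infer_model.graph)
with infer_model.graph.as_default():
    loaded_infer_model, _ = model_helper.create_or_load_model(
        infer_model.model, config.model_dir, infer_sess, "infer")
run_infer(config, loaded_infer_model, infer_sess,
          os.path.join(config.model_dir, "predictions.txt"))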
Example #2
def run_test(config, infer_model, infer_sess, data_file, model_dir):
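    # Restore the latest checkpoint from model_dir, run inference over data_file,
    # append the predicted label to each input line in pred_file, and log the
    # average per-batch prediction time.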
    output_file = "output_" + os.path.split(data_file)[-1].split(".")[0]
    pred_file = os.path.join(model_dir, output_file)
    logger.info("  predictions to output %s." % pred_file)

    with infer_model.graph.as_default():
        loaded_infer_model, global_step = model_helper.create_or_load_model(
            infer_model.model, model_dir, infer_sess, "infer")

        # running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        # running_vars_initializer = tf.variables_initializer(var_list=running_vars)

        # TODO: tf.metrics
        # infer_sess.run(running_vars_initializer)
        infer_sess.run(tf.local_variables_initializer())

    infer_data = data_helper.load_data(data_file,
                                       config.word_vocab_file,
                                       config.char_vocab_file,
                                       w_max_len1=config.max_word_len1,
                                       w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1,
                                       c_max_len2=config.max_char_len2,
                                       text_split="|",
                                       split="\t",
                                       mode="infer")
    infer_iterator = data_helper.batch_iterator(infer_data,
                                                batch_size=config.batch_size,
                                                shuffle=False,
                                                mode="infer")

    start_time = time.time()
    step = 0
    pred_labels = []
    lines = open(data_file, "r", encoding="utf-8").readlines()
    with open(pred_file, mode="w", encoding="utf-8") as pred_f:
        pred_f.write("")
        while True:
            try:
                b_word_ids1, b_word_ids2, b_word_len1, b_word_len2, b_char_ids1, b_char_ids2, b_char_len1, b_char_len2 = next(
                    infer_iterator)
                pred = loaded_infer_model.infer(infer_sess, b_word_ids1,
                                                b_word_ids2, b_word_len1,
                                                b_word_len2, b_char_ids1,
                                                b_char_ids2, b_char_len1,
                                                b_char_len2)
                pred_labels.extend(pred)
                step += 1
            except StopIteration:
                break
        end_time = time.time()
        for line, p in zip(lines, pred_labels):
            res = line.strip() + "\t" + str(p) + "\n"
            pred_f.write(res)

    step_time = (end_time - start_time) / step
    logger.info("# predict step time %.4fs" % step_time)
Example #3
def test():
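    # Convert the Set5/Set14 test images to h5 files, restore the latest SRCNN
    # checkpoint, and report the evaluation metric on both test sets.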
    print("process the image to h5file.....")
    test_dir = flags.test_dir
    test_h5_dir = flags.test_h5_dir
    stride = flags.test_stride
    if not os.path.exists(test_h5_dir):
        os.makedirs(test_h5_dir)

    test_set5 = os.path.join(test_dir, 'Set5')
    test_set14 = os.path.join(test_dir, 'Set14')
    path_set5 = os.path.join(test_h5_dir, 'Set5')
    path_set14 = os.path.join(test_h5_dir, 'Set14')
    data_helper.gen_input_image(test_set5, path_set5, stride)
    data_helper.gen_input_image(test_set14, path_set14, stride)

    print("initialize the model......")
    model_dir = flags.model_dir
    model = SRCNN(flags)
    model.build_graph()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(model.sess, ckpt.model_checkpoint_path)
    else:
        print("model info didn't exist!")
        raise ValueError

    print("test in Set5......")
    test_h5_path = os.path.join(path_set5, "data.h5")
    data_set5, label_set5 = data_helper.load_data(test_h5_path)
    accu = model.test(data_set5, label_set5)
    print("the accuracy in Set5 is %.5f", accu)

    print("test in Set14......")
    test_h5_path = os.path.join(path_set14, "data.h5")
    data_set14, label_set14 = data_helper.load_data(test_h5_path)
    accu2 = model.test(data_set14, label_set14)
    print("the accuracy in Set14 is %.5f", accu2)
Example #4
def train():
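    # Convert the training images to an h5 file, load the (data, label) pairs,
    # and train the SRCNN model on them.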
    print("process the image to h5file.....")
    data_dir = flags.data_dir
    h5_dir = flags.h5_dir
    stride = flags.train_stride
    data_helper.gen_input_image(data_dir, h5_dir, stride)

    print("reading data......")
    h5_path = os.path.join(h5_dir, "data.h5")
    data, label = data_helper.load_data(h5_path)

    print("initialize the model......")
    model = SRCNN(flags)
    model.build_graph()
    model.train(data, label)
Example #5
def mode_evaluate(config, input_path):
    """
    执行eval模式。评估模型。

    :param config: 配置文件
    :param input_path: 数据集路径
    :return: 无
    """
    # 读入数据
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入测试集文件")
    # 读取已有字典
    my_vocab = load_vocabulary(max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("载入已有字典, 字典实际大小:{} , 字典设置大小: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size
    ))

    # Preprocess the data (convert to id representation and pad)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Test  Set size: %d" % len(x_test))

    config.keep_prob = 1.0
    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create a batcher for the test set
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)
    # Start evaluating the model
    evaluate(classifier, config, test_batcher)
Example #6
tf.app.flags.DEFINE_string('data_path', '../text_data/input_data/', 'input data path')
# Model params
tf.app.flags.DEFINE_string("filter_sizes", "2,3,4", "textcnn model, convolution filter sizes")
tf.app.flags.DEFINE_integer("num_filters", 2, "textcnn model, convolution filter nums")
tf.app.flags.DEFINE_integer("num_classes", 2, "num_classes")
tf.app.flags.DEFINE_float("keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.app.flags.DEFINE_integer("hidden_num", 2, "Number of RNNCell num")
tf.app.flags.DEFINE_integer("hidden_size", 2, "Number of RNN layers")
# Training params
tf.app.flags.DEFINE_float("learning_rate", 0.01, "learning_rate (default: 0.01)")
tf.app.flags.DEFINE_integer("epochs", 10, "Number of training epochs (default: 10)")
tf.app.flags.DEFINE_integer("batch_size", 512, "Batch Size (default: 64)")
tf.app.flags.DEFINE_integer("checkpoint_every", 100, "Save model every steps (default: 100)")
tf.app.flags.DEFINE_string("checkpoint_dir", './model_save/', "checkpoint_dir")

train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen = data_helper.load_data('../text_data/input_data/')
print(train_x.shape)
print(vocab_size)
print(embedding.shape)
print(embedding.dtype)
print(maxlen)


# model = FastText(
#     num_classes=FLAGS.num_classes,
#     sequence_length=maxlen,
#     w2v_model_embedding=embedding,
#     vocab_size=vocab_size,
#     embedding_size=200)

# model = TextCNN(filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),

Example #7

file_list = [
    # './data/vaccine/vaccine_month_sample.tsv',
    # './data/vaccine/vaccine_year_sample.tsv',
    # './data/parties/parties_year_sample.tsv',
    # './data/aware/aware_month_sample.tsv',
    # './data/economy/economy_rel_month_sample.tsv',
    # './data/economy/economy_rel_year_sample.tsv',
    './data/amazon/amazon_review_month_sample.tsv',
    './data/amazon/amazon_review_year_sample.tsv',
    './data/yelp/yelp_Hotels_month_sample.tsv',
    './data/yelp/yelp_Hotels_year_sample.tsv',
    './data/yelp/yelp_Restaurants_month_sample.tsv',
    './data/yelp/yelp_Restaurants_year_sample.tsv',
]
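
# For each dataset, build tf-idf and binary feature files with
# data_helper.train_fvs_da, writing them under the parallel 'features'
# directory and printing the path returned by each call.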

for data_path in file_list:
    dataset = data_helper.load_data(data_path)
    paths = data_path.split('/')
    paths[1] = 'features'
    paths[-1] = paths[-1][:-11]
    outp = '/'.join(paths)
    for ftype in ['tfidf', 'binary']:
        tmp_path = outp + '_' + ftype + '.pkl'
        tmp_path = data_helper.train_fvs_da(dataset,
                                            outputfile=outp,
                                            balance=False,
                                            fea_type=ftype)
        #fvs_file = pickle.load(open(tmp_path, 'rb'))

        print(tmp_path)
        #for balance in [True, False]:
        #    print('\t----------------Balance: ' + str(balance) + '---------------')
Example #8
def train(config, model_creator):
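    # End-to-end training loop: build separate train/eval graphs and sessions,
    # restore or initialize the model, accumulate running metrics and log them
    # every steps_per_stats steps, and checkpoint + evaluate on the dev set
    # every steps_per_eval steps.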
    steps_per_stats = config.steps_per_stats
    steps_per_eval = config.steps_per_eval
    model_dir = config.model_dir
    log_dir = config.log_dir
    ckpt_name = config.ckpt_name
    ckpt_path = os.path.join(model_dir, ckpt_name)

    # Create model
    train_model = model_helper.create_model(model_creator, config, "train")
    eval_model = model_helper.create_model(model_creator, config, "eval")
    # infer_model = model_helper.create_model(model_creator, config, "infer")

    train_data = data_helper.load_data(config.train_file,
                                       config.word_vocab_file,
                                       config.char_vocab_file,
                                       w_max_len1=config.max_word_len1,
                                       w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1,
                                       c_max_len2=config.max_char_len2,
                                       text_split="|",
                                       split="\t")
    train_iterator = data_helper.batch_iterator(train_data,
                                                batch_size=config.batch_size,
                                                shuffle=True)

    eval_data = data_helper.load_data(config.dev_file,
                                      config.word_vocab_file,
                                      config.char_vocab_file,
                                      w_max_len1=config.max_word_len1,
                                      w_max_len2=config.max_word_len2,
                                      c_max_len1=config.max_char_len1,
                                      c_max_len2=config.max_char_len2,
                                      text_split="|",
                                      split="\t")
    # eval_iterator = data_helper.batch_iterator(eval_data, batch_size=config.batch_size, shuffle=False)

    # TensorFlow model
    session_config = utils.get_config_proto()
    train_sess = tf.Session(config=session_config, graph=train_model.graph)
    eval_sess = tf.Session(config=session_config, graph=eval_model.graph)
    # infer_sess = tf.Session(config=config, graph=infer_model.graph)

    # Summary Writer
    train_summary_writer = tf.summary.FileWriter(
        os.path.join(log_dir, "train_log"), train_model.graph)
    eval_summary_writer = tf.summary.FileWriter(
        os.path.join(log_dir, "eval_log"), eval_model.graph)

    with train_model.graph.as_default():
        loaded_train_model, global_step = model_helper.create_or_load_model(
            train_model.model, model_dir, train_sess, "train")
        local_initializer = tf.local_variables_initializer()

        # running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        # running_vars_initializer = tf.variables_initializer(var_list=running_vars)

    step_time, train_loss, train_acc, train_rec, train_pre, train_f1, train_auc, gN = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    lr = loaded_train_model.learning_rate.eval(session=train_sess)
    last_stat_step = global_step
    last_eval_step = global_step

    logger.info("# Start step %d" % global_step)

    epoch_idx = 0
    while epoch_idx < config.num_train_epochs:
        start_time = time.time()
        try:
            # TODO: tf.metrics
            # train_sess.run(running_vars_initializer)
            train_sess.run(local_initializer)

            batch = next(train_iterator)
            b_word_ids1, b_word_ids2, b_word_len1, b_word_len2, b_char_ids1, b_char_ids2, b_char_len1, b_char_len2, b_labels = batch
            # for b in batch:
            #     print(b)
            train_summary1, pred, step_loss, _, acc_op, rec_op, pre_op, auc_op, global_step, grad_norm, lr = \
                loaded_train_model.train(train_sess, b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                                         b_char_ids1, b_char_ids2, b_char_len1, b_char_len2, b_labels)
            train_summary2, step_acc, step_rec, step_pre, step_auc = \
                train_sess.run([loaded_train_model.train_summary2,
                                loaded_train_model.accuracy,
                                loaded_train_model.recall,
                                loaded_train_model.precision,
                                loaded_train_model.auc])
            config.epoch_step += 1

        except StopIteration:
            # Finished going through the training dataset.  Go to next epoch.
            epoch_idx += 1
            config.epoch_step = 0
            train_iterator = data_helper.batch_iterator(
                train_data, batch_size=config.batch_size, shuffle=True)
            continue

        step_time += (time.time() - start_time)
        train_loss += step_loss
        train_acc += step_acc
        train_rec += step_rec
        train_pre += step_pre
        train_auc += step_auc
        gN += grad_norm

        if global_step - last_stat_step >= steps_per_stats:
            last_stat_step = global_step
            step_time /= steps_per_stats
            train_loss /= steps_per_stats
            train_acc /= steps_per_stats
            train_rec /= steps_per_stats
            train_pre /= steps_per_stats
            train_auc /= steps_per_stats
            train_f1 = (2 * train_rec * train_pre) / (train_rec + train_pre +
                                                      0.00000001)
            gN /= steps_per_stats

            logger.info(
                "  step %d lr %g step_time %.2fs loss %.4f acc %.4f rec %.4f pre %.4f f1 %.4f auc %.4f gN %.2f"
                % (global_step, lr, step_time, train_loss, train_acc,
                   train_rec, train_pre, train_f1, train_auc, gN))
            train_summary_writer.add_summary(train_summary1,
                                             global_step=global_step)
            train_summary_writer.add_summary(train_summary2,
                                             global_step=global_step)
            step_time, train_loss, train_acc, train_rec, train_pre, train_f1, train_auc, gN = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step
            # Save checkpoint
            loaded_train_model.saver.save(train_sess,
                                          ckpt_path,
                                          global_step=global_step)
            # Evaluate on dev
            run_eval(config,
                     eval_model,
                     eval_sess,
                     eval_data,
                     model_dir,
                     ckpt_name,
                     eval_summary_writer,
                     save_on_best=True)

    logger.info("# Finished epoch %d, step %d." % (epoch_idx, global_step))

    # Done training
    loaded_train_model.saver.save(train_sess,
                                  ckpt_path,
                                  global_step=global_step)
    logger.info(
        "# Final, step %d lr %g step_time %.2fs loss %.4f acc %.4f rec %.4f pre %.4f f1 %.4f auc %.4f gN %.2f"
        % (global_step, lr, step_time, train_loss, train_acc, train_rec,
           train_pre, train_f1, train_auc, gN))
    logger.info("# Done training!")

    train_summary_writer.close()
    eval_summary_writer.close()

Example #9

def train_step(config_disc, config_evl):
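    # Train the discriminator model: build train/valid/test models that share
    # variables, decay the learning rate after max_decay_epoch, checkpoint every
    # checkpoint_every epochs, and report accuracy on the test set at the end.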

    print("loading the disc train set")
    config = config_disc
    eval_config = config_evl
    eval_config.keep_prob = 1.0

    train_data, valid_data, test_data = data_helper.load_data(
        True, config.max_len, batch_size=config.batch_size)

    print("begin training")

    # gpu_config=tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth=True
    with tf.Graph().as_default(), tf.Session() as session:
        print("model training")
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            #model = disc_rnn_model.disc_rnn_model(config=config,is_training=True)
            model = create_model(session, config, is_training=True)

        with tf.variable_scope("model", reuse=True, initializer=initializer):
            #valid_model = disc_rnn_model.disc_rnn_model(config=eval_config,is_training=False)
            #test_model = disc_rnn_model.disc_rnn_model(config=eval_config,is_training=False)
            valid_model = create_model(session, eval_config, is_training=False)
            test_model = create_model(session, eval_config, is_training=False)

        #add summary
        # train_summary_op = tf.merge_summary([model.loss_summary,model.accuracy])
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     session.graph)

        # dev_summary_op = tf.merge_summary([valid_model.loss_summary,valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                   session.graph)

        #add checkpoint
        checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "disc.model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        #saver = tf.train.Saver(tf.all_variables())

        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            lr_decay = config.lr_decay**max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     config_disc.batch_size,
                                     train_summary_writer, dev_summary_writer)

            if i % config.checkpoint_every == 0:
                path = model.saver.save(session, checkpoint_prefix,
                                        global_steps)
                print("Saved model chechpoint to{}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        test_accuracy = evaluate(test_model, session, test_data,
                                 config_disc.batch_size)
        print("the test data accuracy is %f" % test_accuracy)
        print("program end!")
Example #10
def mode_train(config, input_path):
    """
    执行train模式。按照给定配置,训练模型。

    :param config: 配置文件
    :param input_path: 数据集路径
    :return: 无
    """
    # 读入训练集和测试集
    x_train, y_train = load_data(
        os.path.join(input_path, "data_train.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入训练集文件")
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("成功载入测试集文件")
    # 获取验证集
    if os.path.isfile(os.path.join(input_path, "data_valid.txt")):
        # 从验证集文件中获取
        x_valid, y_valid = load_data(
            os.path.join(input_path, "data_test.txt"),
            sample_ratio=config.data_sample_ratio,
            n_class=config.n_class,
            one_hot=config.one_hot,
        )
        print("成功载入验证集文件")
    else:
        # 将测试集的一部分分割出来,作为验证集
        split_radio = config.valid_test_split_radio  # 设置分割比例
        x_test, x_valid, y_test, y_valid = split_dataset(
            x_test, y_test, split_radio)
        print("没有发现验证集文件,已分割测试集的 {}% 来作为验证集".format(split_radio * 100))

    # Build the vocabulary
    my_vocab = make_vocabulary(x_train, max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("使用训练集数据 制作字典完成, 字典实际大小:{} , 字典设置大小: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size))

    # Preprocess the data (convert to id representation and pad)
    print('Preprocessing the datasets (word -> id)')
    x_train = data_preprocessing(x_train, my_vocab, max_len=config.max_len)
    x_valid = data_preprocessing(x_valid, my_vocab, max_len=config.max_len)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Train Set size: %d" % len(x_train))
    print("Valid Set size: %d" % len(x_valid))
    print("Test  Set size: %d" % len(x_test))

    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create batchers for the train, validation, and test sets
    train_batcher = Batcher(x_train, y_train, batch_size=config.batch_size)
    valid_batcher = Batcher(x_valid, y_valid, batch_size=config.batch_size)
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)
    # Start training the model
    train(classifier, config, train_batcher, valid_batcher, test_batcher)
Example #11
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, RandomSampler
# assumed source of the Bert* classes used below
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from utils.data_helper import QuerySimilarityProcessor, load_data
# import pandas as pd
# pd.options.display.max_columns = None


processor = QuerySimilarityProcessor()
label_list = processor.get_labels()

num_labels = len(label_list)
config = BertConfig.from_pretrained("../data/bert-base-chinese/config.json", num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained("../data/bert-base-chinese/")
tokenizer = BertTokenizer.from_pretrained('../data/bert-base-chinese/vocab.txt')

train_dataset = load_data("../data/ATEC/", processor, tokenizer)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

valid_dataset = load_data("../data/ATEC/", processor, tokenizer, evaluate=True)
valid_sampler = RandomSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=32)

config_class, model_class, tokenizer_class = [BertConfig, BertForSequenceClassification, BertTokenizer]
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
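    # note: both parameter groups below use weight_decay 0.0, so the no_decay
    # split has no effect as written; the first group is usually given a
    # non-zero decay value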
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
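
# A minimal training-loop sketch for the setup above (an assumption, not part of
# the original snippet): it presumes each batch from train_dataloader unpacks to
# (input_ids, attention_mask, token_type_ids, labels) and uses torch.optim.AdamW
# with an illustrative learning rate.
import torch
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)  # lr chosen for illustration

model.train()
for batch in train_dataloader:
    input_ids, attention_mask, token_type_ids, labels = (t.to(device) for t in batch)
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels)
    loss = outputs[0]  # loss is returned first when labels are provided
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()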
Example #12
                     "how often to run a validation minibatch.")
flags.DEFINE_integer('validate_batch_size', 128,
                     "how many nodes per validation sample.")
flags.DEFINE_integer('print_every', 10, "How often to print training info.")
flags.DEFINE_integer('max_total_steps', 1000,
                     "Maximum total number of iterations")
flags.DEFINE_string('temporal_learner', 'LSTM',
                    'Which temporal learner to choose.')
flags.DEFINE_integer('gpu', 0, "which gpu to use.")

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = str(FLAGS.gpu)

#  main procedure
print("### loading training data...")
train_data = load_data(FLAGS.train_prefix)

graphs_with_views = train_data[0]  #list,(10,4)
features = train_data[1]  #None
id_map = train_data[2]  #dict,35981
val_edges = train_data[4]  #list,(399,2)
context_pairs = train_data[3] if FLAGS.random_context else None
# pad with dummy zero vector
if features is not None:
    features = np.vstack([features, np.zeros((features.shape[1], ))])
# print(features) #None

print("### Initializing minibatch iterator...")
placeholders = construct_placeholders()
minibatch = EdgeMinibatchIterator(graphs_with_views,
                                  id_map,
Example #13
         'News data - economy'),
        ('./data/yelp/yelp_Restaurants_month_sample.tsv',
         './data/yelp/yelp_Restaurants_year_sample.tsv', 'yelp_rest',
         'Reviews data - restaurants'
         ),  # './data/yelp/yelp_Restaurants_month_sample.tsv'
    ]
    for pair in file_list:
        print(pair)
        for is_binary in [False]:  # True, skip binary currently
            # on month
            month_file = pair[0]
            year_file = pair[1]
            output = pair[2]

            if month_file:
                dataset = data_helper.load_data(month_file)
                # test on balanced data
                print('Test on balanced data')
                test_balance = cross_test_domain_clf(dataset,
                                                     domain2month,
                                                     data_name=None,
                                                     balance=True,
                                                     binary=is_binary)
                test_balance.to_csv('./tmp/' + output + '_month.tsv', sep='\t')
                viz_perform(
                    test_balance, pair[3], './image/' + output +
                    '/cross_clf_balance_month_' + str(is_binary) + '.pdf')
                test_balance = None

#                print('Test on unbalanced data')
#                test_unbalance = cross_test_domain_clf(dataset, domain2month, data_name=None, balance=False, binary=is_binary)