Exemplo n.º 1
0
def test(data, batch_size=64, filename='roc.png', **kwargs):
    """Restore an RNNModel from `args.checkpoint`, score `data` in batches,
    and plot a ROC curve to `filename`.

    Args:
        data: sequence of samples; each exposes `.feature_dim`, `.seq` and
            `.label`, and receives a `.prediction` attribute here. Samples
            whose `.seq` is None are dropped before scoring.
        batch_size: number of samples scored per `model.predict` call.
        filename: output path for the ROC plot.
        **kwargs: forwarded to the RNNModel constructor.
    """
    global args

    assert args.checkpoint is not None
    # NOTE(review): feature_dim is read from data[0] *before* the None-seq
    # filter below — assumes the first sample is representative; confirm.
    model = RNNModel(feature_dims=data[0].feature_dim,
                     model_dir=args.output_dir,
                     **kwargs)
    model.restore(args.checkpoint)
    # Samples with no sequence cannot be featurized; drop them up front.
    data = list(filter(lambda d: d.seq is not None, data))
    for i in tqdm(range(0, len(data), batch_size)):
        # Slice once per iteration instead of twice (original recomputed it).
        batch = data[i:i + batch_size]
        x, y, length = get_feature_label(batch, length_limit=10000)
        predictions = model.predict(x, length)
        # Attach each score to its sample so they can be collected below.
        for sample, pred in zip(batch, predictions):
            sample.prediction = pred

    predictions = list(map(attrgetter('prediction'), data))
    labels = list(map(attrgetter('label'), data))

    plot_roc(predictions, labels, filename=filename)
Exemplo n.º 2
0
    def predicting(self):
        vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file,
                                      False)
        data_helper = DataHelper(vocab_to_id)

        reader = open("data/predict_data", 'r')
        writer = open("data/res", 'w')
        dishs = []
        file_data = []
        for line in reader.readlines():
            line = line.strip().decode("utf-8")
            line_split = line.split("\t")
            if len(line_split) != 2:
                continue
            type, dish_name = line_split
            dishs.append(dish_name)
            file_data.append(line_split)

        batch = data_helper.create_prediction_batch(dishs)
        with tf.Session() as sess:
            cnn_model = RNNModel(self.rnn_size,
                                 self.embedding_size, self.class_num,
                                 len(vocab_to_id), self.learning_rate,
                                 self.model_path)
            ckpt = tf.train.get_checkpoint_state(self.model_dir)
            cnn_model.saver.restore(sess, ckpt.model_checkpoint_path)
            prediction, pre_label = cnn_model.predict(sess, batch)
            pre_pre = sess.run(tf.nn.softmax(prediction))
            print pre_label, pre_pre, prediction
            for idx, sub_review_lable in enumerate(pre_label):
                writer.write("{}\t{}\t{}\n".format(file_data[idx][0],
                                                   file_data[idx][1],
                                                   sub_review_lable))
Exemplo n.º 3
0
def predict_dajare(args):
    """Score a single dajare (pun) string with a pretrained RNNModel.

    Args:
        args: namespace providing `.dajare` (raw sentence), `.weights_path`
            (model checkpoint) and `.vocab_data_path` (vocabulary file).

    Returns:
        The model's prediction array for the tokenized, id-encoded input.
    """
    dajare_raw = args.dajare
    weights_path = args.weights_path
    vocab_data_path = args.vocab_data_path

    tokenizer = TokenizerSpacy()

    dajare_words = tokenizer.tokenize_sentence(dajare_raw)
    logging.info(dajare_words)

    vocab = Vocab(vocab_data_path)
    dajare_labeled = vocab.convert_word2id(dajare_words)
    logging.info(dajare_labeled)

    # Hyperparameters must match those used when the checkpoint was saved.
    batch_size = 30
    T = 25
    emb_size = 128
    hidden_size = 128
    dropout = 0.0
    lr = 1e-3
    vocab_size = vocab.vocab_num

    model = RNNModel(batch_size=batch_size,
                     vocab_size=vocab_size,
                     emb_size=emb_size,
                     hidden_size=hidden_size,
                     T=T,
                     dropout=dropout,
                     lr=lr,
                     model_path=None)
    model.print_fn = logging.info

    model.load_weights(weights_path)

    # BUG FIX: the original called model.predict twice, feeding the first
    # prediction back into the model. Predict once on the encoded input.
    probability = model.predict(
        np.array([dajare_labeled], dtype=np.float32))
    # logging.* takes a %-style format string, not print-style varargs.
    logging.info('Probability: %s', probability[0])
    return probability
Exemplo n.º 4
0
def train(args):
    """Load a pretrained RNNModel and run it on the tokenized query.

    NOTE(review): despite its name, this function only runs inference on
    `args.query` with the checkpoint at `args.checkpoint_path`; it does
    not train anything — confirm intent with the author.
    """
    checkpoint_path = args.checkpoint_path
    dajare_sentence = args.query

    # Tokenize the query sentence with GiNZA and keep the surface forms.
    nlp = spacy.load('ja_ginza_nopn')
    words = [token.orth_ for token in nlp(dajare_sentence)]

    # Hyperparameters must match those used when the checkpoint was saved.
    hparams = dict(
        batch_size=32,
        T=32,
        emb_size=128,
        hidden_size=128,
        dropout=0.0,
        lr=1e-3,
    )

    data_gen = DataForGenerator(batch_size=hparams['batch_size'],
                                T=hparams['T'])
    data_gen.load_vocab('./vocab.csv', vocab_size=50000)

    words_id, _ = data_gen.preprocess([words], None)

    vocab_size = len(data_gen.vocab.word2id)
    print("Vocab size: ", vocab_size)

    model = RNNModel(vocab_size=vocab_size, model_path=None, **hparams)

    model.load_weights(checkpoint_path)

    print(words)
    print(words_id)

    pred = model.predict(words_id[0])

    print(pred)
    print(pred.shape)
Exemplo n.º 5
0
    def predicting_2(self):
        """Run the restored RNN model over "data/other_data" batch by batch
        and write one predicted label per line to "data/res_other".

        The output file and TF session are managed with `with` so the file
        is flushed and closed even on error (the original leaked `writer`).
        """
        vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file,
                                      False)
        # NOTE(review): data_helper is unused here; kept in case DataHelper's
        # constructor has side effects — confirm before removing.
        data_helper = DataHelper(vocab_to_id)
        data_generator = SentenceGenerator("data/other_data")
        batchManage = BatchManager(data_generator, self.batch_size,
                                   vocab_to_id)
        with tf.Session() as sess, open("data/res_other", "w") as writer:
            models = RNNModel(self.rnn_size,
                              self.embedding_size, self.class_num,
                              len(vocab_to_id), self.learning_rate,
                              self.model_path)
            ckpt = tf.train.get_checkpoint_state(self.model_dir)
            models.saver.restore(sess, ckpt.model_checkpoint_path)

            for batchs in batchManage.getBatches():
                prediction, pre_label = models.predict(sess, batchs)
                for sub_review_lable in pre_label:
                    writer.write(str(sub_review_lable) + "\n")
Exemplo n.º 6
0
    def predicting_1(self):
        vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file,
                                      False)
        data_helper = DataHelper(vocab_to_id)

        dishs = [u"网上的口碑什么的蛮好的一家店 专门打了个电话让这边的师傅上门帮我们家的小宝宝理了一个头发"]
        batch = data_helper.create_prediction_batch(dishs)
        with tf.Session() as sess:
            cnn_model = RNNModel(self.rnn_size,
                                 self.embedding_size, self.class_num,
                                 len(vocab_to_id), self.learning_rate,
                                 self.model_path)
            ckpt = tf.train.get_checkpoint_state(self.model_dir)
            cnn_model.saver.restore(sess, ckpt.model_checkpoint_path)
            prediction, pre_label = cnn_model.predict(sess, batch)
            pre_pre = sess.run(tf.nn.softmax(prediction))
            print pre_label, pre_pre, prediction
            for idx, sub_review_lable in enumerate(pre_label):
                print "{}\t{}".format(
                    dishs[idx], data_helper.get_cats_name(sub_review_lable))