def lookup_JM(WIDTH, DEPTH):
    # Load the jaso-to-index mapping table from a cp949-encoded CSV.
    MAPPING_PATH = './Ch01_Data_load/data/dict.csv'
    lookup = pd.read_csv(MAPPING_PATH, encoding='cp949')
    keys = list(lookup.iloc[:, 0])    # jaso characters
    values = list(lookup.iloc[:, 1])  # integer indices
    JM = jmu.JasoMapping(WIDTH=WIDTH, DEPTH=DEPTH, MAPPING_KEY=keys, MAPPING_VALUE=values)
    return JM
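The loader assumes dict.csv is a cp949-encoded file with a header row, jaso characters in the first column, and integer indices in the second; a self-contained round-trip under that assumption (the file name and contents here are invented):

# Illustrative only: the layout is inferred from the read_csv/iloc usage above.
demo = pd.DataFrame({'jaso': ['ㄱ', 'ㅏ', 'ㄴ'], 'index': [1, 2, 3]})
demo.to_csv('demo_dict.csv', index=False, encoding='cp949')
lookup = pd.read_csv('demo_dict.csv', encoding='cp949')
print(list(lookup.iloc[:, 0]), list(lookup.iloc[:, 1]))  # ['ㄱ', 'ㅏ', 'ㄴ'] [1, 2, 3]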
def generate_batch_jaso(INDEX, MODEL, DOC, LABEL, MAXLEN, SESS):
    # Split the selected documents into jaso units, padded/truncated to MAXLEN.
    jaso_splitted = jmu.jaso_split(DOC[INDEX], MAXLEN=MAXLEN)
    _input = SESS.run(MODEL.jaso_Onehot, {MODEL.X_Onehot: jaso_splitted})
    # Drop samples whose one-hot encoding came out empty.
    _, del_list = length(_input)
    _label = LABEL[INDEX]
    batch_input = np.delete(_input, del_list, axis=0)
    batch_label = np.delete(_label, del_list, axis=0)
    return batch_input, batch_label
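A minimal usage sketch for the batch generator above; model, sess, TRAIN_DOC, TRAIN_LABEL, and FLAGS.INPUT_WIDTH stand in for objects defined elsewhere in the repository, and the batch size is illustrative:

BATCH = 64  # illustrative batch size
perm = np.random.permutation(len(TRAIN_DOC))
for start in range(0, len(perm), BATCH):
    idx = perm[start:start + BATCH]
    batch_input, batch_label = generate_batch_jaso(
        INDEX=idx, MODEL=model, DOC=TRAIN_DOC, LABEL=TRAIN_LABEL,
        MAXLEN=FLAGS.INPUT_WIDTH, SESS=sess)
    # feed batch_input / batch_label to the training op here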
def lookup_JM(WIDTH, DEPTH):
    # Use the repository-relative path rather than a machine-specific absolute one.
    MAPPING_PATH = './Ch01_Data_load/data/dict.csv'
    lookup = pd.read_csv(MAPPING_PATH, encoding='cp949')
    keys = list(lookup.iloc[:, 0])
    values = list(lookup.iloc[:, 1])
    JM = jmu.JasoMapping(WIDTH=WIDTH,
                         DEPTH=DEPTH,
                         MAPPING_KEY=keys,
                         MAPPING_VALUE=values)
    return JM
Example #4
def lookup_WM(MAXLEN, IMAGE_WIDTH, IMAGE_DEPTH):
    MAPPING_PATH = './Ch01_Data_load/data/vocab.npy'
    # np.load returns an ndarray, so index columns directly instead of using
    # the pandas-only .iloc accessor.
    lookup = np.load(MAPPING_PATH)
    keys = list(lookup[:, 0])    # vocabulary tokens
    values = list(lookup[:, 1])  # integer indices
    WM = jmu.JasoMapping(MAXLEN=MAXLEN,
                         WIDTH=IMAGE_WIDTH,
                         DEPTH=IMAGE_DEPTH,
                         MAPPING_KEY=keys,
                         MAPPING_VALUE=values)
    return WM
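The fix above indexes the loaded array directly because np.load returns an ndarray, not a DataFrame; a quick sanity check, assuming vocab.npy stores an (N, 2) array of token/index pairs (that layout is inferred from the column slicing, not confirmed):

# Sketch only: allow_pickle may be needed for string arrays on newer NumPy.
vocab = np.load('./Ch01_Data_load/data/vocab.npy', allow_pickle=True)
assert vocab.ndim == 2 and vocab.shape[1] == 2
print(vocab[:3])  # first few (token, index) pairs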
Example #5
def generate_batch_jaso(INDEX, MODEL, DOC, LABEL, MAXLEN, SESS, ATTENTION):
    # Both branches share the same jaso split; the attention path additionally
    # returns per-sample sequence lengths for the recurrent layer.
    jaso_splitted = jmu.jaso_split(DOC[INDEX], MAXLEN=MAXLEN)

    if ATTENTION:
        # Effective length of each sample after MAX_POOL_TIME stride-2 poolings.
        seq_len = np.array(list(map(find_length, jaso_splitted)))
        seq_len = np.ceil(seq_len / (2**MODEL.MAX_POOL_TIME)).astype(np.int32)
        batch_input = SESS.run(MODEL.jaso_Onehot,
                               {MODEL.X_Onehot: jaso_splitted})
        batch_label = LABEL[INDEX]

        return batch_input, batch_label, seq_len

    else:
        _input = SESS.run(MODEL.jaso_Onehot, {MODEL.X_Onehot: jaso_splitted})
        # Drop samples whose one-hot encoding came out empty.
        _, del_list = length(_input)
        _label = LABEL[INDEX]
        batch_input = np.delete(_input, del_list, axis=0)
        batch_label = np.delete(_label, del_list, axis=0)

        return batch_input, batch_label
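The ceil division above tracks how a length-L input shrinks through MAX_POOL_TIME stride-2 max-pool layers; a small standalone illustration (the lengths and pool depth are made up):

import numpy as np

lengths = np.array([37, 100, 5])  # hypothetical raw jaso lengths
MAX_POOL_TIME = 3                 # three stride-2 pooling layers
print(np.ceil(lengths / 2**MAX_POOL_TIME).astype(np.int32))  # [ 5 13  1]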
def generate_batch_jaso(INDEX, MODEL, DOC, LABEL, MAXLEN, SESS):
    jaso_splitted = jmu.jaso_split(DOC[INDEX], MAXLEN=MAXLEN)
    _input = SESS.run(MODEL.jaso_Onehot, {MODEL.X_Onehot: jaso_splitted})
    _, del_list = length(_input)
    _label = LABEL[INDEX].reshape(INDEX.shape[0], -1)

    # Report any samples that will be dropped, then actually drop them.
    if len(del_list) > 0:
        print(jaso_splitted[del_list[0]], del_list)
    print(_input.shape, _label.shape, del_list)

    batch_input = np.delete(_input, del_list, axis=0)
    batch_label = np.delete(_label, del_list, axis=0)

    # Return the filtered batch so the deleted rows are not fed to the model.
    return batch_input, batch_label
def main():
    args = parse_args()
    if args is None:
        exit()

    # test file
    filename = args.inputdir + args.input

    # init model config
    #TRAIN_DOC, TRAIN_LABEL, TEST_DOC, TEST_LABEL, LABEL_IDX = data_load.digi_data_load()
    #TRAIN_DOC, TRAIN_LABEL, TEST_DOC, TEST_LABEL, LABEL_IDX = data_load.testcase_add_data_load()
    TRAIN_DOC, TRAIN_LABEL, TEST_DOC, TEST_LABEL, LABEL_IDX = \
        data_load.testcase_shuffle_data_load()
    class_num = TRAIN_LABEL.shape[1]
    FLAGS.NUM_OF_CLASS = class_num
    JM = utils.lookup_JM(FLAGS.INPUT_WIDTH, FLAGS.INPUT_DEPTH)

    # Start Session
    sess = tf.Session()
    print("Session Ready!")
    model = MODEL(sess=sess, JM=JM, FLAGS=FLAGS)

    # Initialization
    sess.run(tf.global_variables_initializer())
    model.JM.init_table(sess)

    # Restore parameter
    saver = tf.train.Saver()
    saver.restore(sess, "./Saver/{}/{}.ckpt".format(FLAGS.WRITER,
                                                    FLAGS.WRITER))

    if args.printtest == 'True':
        index = np.arange(len(TEST_DOC))
        batch_input, batch_label = utils.generate_batch_jaso(
            INDEX=index,
            MODEL=model,
            DOC=TEST_DOC,
            LABEL=TEST_LABEL,
            MAXLEN=FLAGS.INPUT_WIDTH,
            SESS=sess)

        proba, ts_loss, ts_acc, ts_merged = sess.run(
            [model.y_proba, model.cross_entropy, model.accuracy, model.merge],
            feed_dict={
                model.X: batch_input,
                model.Y: batch_label,
                model.LEARNING_RATE: FLAGS.lr_value,
                model.TRAIN_PH: False
            })

        pred_idx = np.apply_along_axis(np.argmax, 1, proba)
        real_idx = np.apply_along_axis(np.argmax, 1, batch_label)

        pos_idx = np.where(pred_idx == real_idx)[0]
        neg_idx = np.where(pred_idx != real_idx)[0]

        print('[ TEST ]')
        desc = """
            size:{}, correct:{}, wrong:{}, acc:{}, f1_score:{}, ts_loss:{}, ts_acc:{}
        """.format(index.shape[0], pos_idx.shape[0], neg_idx.shape[0],
                   round(pos_idx.shape[0] / index.shape[0] * 100, 3),
                   round(f1_score(real_idx, pred_idx, average='weighted'), 4),
                   ts_loss, ts_acc)
        print(desc)
        for idx in pos_idx:
            print('Positive Case:\t', TEST_DOC[index[idx]], '\t->\t',
                  LABEL_IDX[np.argmax(proba[idx])],
                  '({0:.2f})\t'.format(round(max(proba[idx]), 3)),
                  LABEL_IDX[np.argmax(batch_label[idx])])

        for idx in neg_idx:
            print(
                'Negative Case:\t', TEST_DOC[index[idx]], '\t->\t',
                LABEL_IDX[np.argmax(proba[idx])],
                '({0:.2f})\t'.format(round(max(proba[idx]), 3)),
                LABEL_IDX[np.argmax(batch_label[idx])],
                '({0:.2f})\t'.format(proba[idx][np.argmax(batch_label[idx])]))

        print()

    if args.input == 'a.txt':
        testfile = open(filename)
        for line in testfile:
            line = line.rstrip('\n\r').lower()

            jaso_splitted = jmu.jaso_split([line], MAXLEN=FLAGS.INPUT_WIDTH)
            batch_input = sess.run(model.jaso_Onehot,
                                   {model.X_Onehot: jaso_splitted})
            y_proba = sess.run(model.y_proba,
                               feed_dict={
                                   model.X: batch_input,
                                   model.TRAIN_PH: False
                               })
            #print(batch_input.shape, y_proba.shape)
            label = LABEL_IDX[np.argmax(y_proba[0])]
            if round(max(y_proba[0]), 3) > 0.8:
                print(line, '\t->\t', label, round(max(y_proba[0]), 3))

    if args.input == 'tmp.txt':
        tmp_df = pd.read_csv(filename, sep='\t', header=None)
        tmp_df['class'] = tmp_df[1].apply(lambda x: json2intent(x))
        sentences = tmp_df[0].tolist()

        labels = []
        probas = []
        for sent in sentences:
            jaso_splitted = jmu.jaso_split([sent], MAXLEN=FLAGS.INPUT_WIDTH)
            batch_input = sess.run(model.jaso_Onehot,
                                   {model.X_Onehot: jaso_splitted})
            y_proba = sess.run(model.y_proba,
                               feed_dict={
                                   model.X: batch_input,
                                   model.TRAIN_PH: False
                               })
            # print(batch_input.shape, y_proba.shape)
            label = LABEL_IDX[np.argmax(y_proba[0])]
            labels.append(label)
            probas.append(round(max(y_proba[0]), 3))

        tmp_df['pred'] = labels
        tmp_df['proba'] = probas

        for index, row in tmp_df.iterrows():
            # Only report rows that have no gold class label.
            if row['class'] is not None:
                continue

            if round(row['proba'], 3) > 0.8:
                print(row[0], '\t->\t', row['pred'], round(row['proba'], 3),
                      '\t', row['class'])

    if args.input == 'tmp_not_under.txt':
        tmp_df = pd.read_csv(filename, sep='\t', header=None)
        tmp_df['class'] = tmp_df[1].apply(lambda x: json2intent(x))
        sentences = tmp_df[0].tolist()

        labels = []
        probas = []
        for sent in sentences:
            jaso_splitted = jmu.jaso_split([sent], MAXLEN=FLAGS.INPUT_WIDTH)
            batch_input = sess.run(model.jaso_Onehot,
                                   {model.X_Onehot: jaso_splitted})
            y_proba = sess.run(model.y_proba,
                               feed_dict={
                                   model.X: batch_input,
                                   model.TRAIN_PH: False
                               })
            # print(batch_input.shape, y_proba.shape)
            label = LABEL_IDX[np.argmax(y_proba[0])]
            labels.append(label)
            probas.append(round(max(y_proba[0]), 3))

        tmp_df['pred'] = labels
        tmp_df['proba'] = probas

        for index, row in tmp_df.iterrows():
            if round(row['proba'], 3) > 0.1:
                print(row[1])
                print(row[0], '\t->\t', row['pred'], round(row['proba'], 3),
                      '\t')
                print()
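The three args.input branches above repeat the same single-sentence inference; a hedged refactoring sketch that factors it into one helper (predict_sentence is not part of the original code, and jmu/np are assumed imported as in the rest of the file):

def predict_sentence(sess, model, sent, label_idx, maxlen):
    """Return (predicted label, rounded max probability) for one sentence."""
    jaso_splitted = jmu.jaso_split([sent], MAXLEN=maxlen)
    batch_input = sess.run(model.jaso_Onehot, {model.X_Onehot: jaso_splitted})
    y_proba = sess.run(model.y_proba,
                       feed_dict={model.X: batch_input, model.TRAIN_PH: False})
    return label_idx[np.argmax(y_proba[0])], round(max(y_proba[0]), 3)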
Example #8
################################################################################
# Get performance scores
################################################################################
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Calculate logits over the test set in batches
LOGIT_list = np.empty([0, 2])
LABEL_list = np.empty([0, 2])
n_batches = int(np.ceil(len(TEST_DOC) / FLAGS.TEST_BATCH))
for i in range(n_batches):
    # Clip the final batch to the end of the test set and drop duplicate indices.
    index = np.unique(
        np.clip(np.arange(i * FLAGS.TEST_BATCH, (i + 1) * FLAGS.TEST_BATCH),
                a_min=0,
                a_max=len(TEST_DOC) - 1))
    jaso_splitted = jmu.jaso_split(TEST_DOC[index], MAXLEN=FLAGS.MAXLEN)
    batch_input = sess.run(model.jaso_Onehot, {model.X_Onehot: jaso_splitted})
    batch_label = TEST_LABEL[index]
    ts_acc, y_logit = sess.run([model.accuracy, model.y_logits],
                               feed_dict={
                                   model.X: batch_input,
                                   model.Y: batch_label,
                                   model.TRAIN_PH: False
                               })
    LOGIT_list = np.concatenate([LOGIT_list, y_logit])
    LABEL_list = np.concatenate([LABEL_list, batch_label])
    print(i, '||', ts_acc)

# Calculate AUROC, accuracy from confusion matrix
# Shift by the row max before exponentiating for numerical stability.
softmax_logit = np.array(
    [np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x))) for x in LOGIT_list])
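The snippet stops before the scores its header promises; a minimal follow-through, assuming two-class one-hot labels with the positive class in column 1 (that layout is an assumption, and both metrics imports already appear above):

# Sketch only: the binary one-hot label layout is assumed, not confirmed.
y_true = np.argmax(LABEL_list, axis=1)
y_pred = np.argmax(softmax_logit, axis=1)

auroc = metrics.roc_auc_score(y_true, softmax_logit[:, 1])
cm = confusion_matrix(y_true, y_pred)
print('AUROC: {:.4f}, accuracy: {:.4f}'.format(auroc, np.trace(cm) / cm.sum()))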