Example #1
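Both snippets omit their module-level setup. A plausible header covering everything they reference is sketched below; the project-local import paths (AMCNN, Token, and the k_* metric functions) are assumptions:

import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

# Project-local pieces; module paths are guesses:
# from model import AMCNN
# from tokenizer import Token
# from metrics import k_precision, k_recall, k_f1score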
def main():
    # Check for an available GPU and enable memory growth
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # Parse command-line arguments
    train_data_path = args.train_data
    max_len = args.max_length
    epoch = args.epochs
    batch_size = args.batch_size
    att_reg = args.att_reg
    lr_decay = args.lr_decay
    lr_rate = args.lr_rate
    warmup_lr_rate = lr_rate * 0.1  # warm up at 10% of the base learning rate
    patience = args.patience
    #period = args.save_period
    weight_save_path = args.weight_save_path
    document = args.document
    label = args.label
    channel = args.channel
    steps_per_epoch = args.train_steps

    # Create the model-weight directory if it does not exist
    if not os.path.isdir(weight_save_path):
        os.mkdir(weight_save_path)

    # Read Data
    if ".csv" in train_data_path:
        read_data = pd.read_csv
    elif ".xlsx" in train_data_path:
        read_data = pd.read_excel
    else:
        read_data = pd.read_table
    train_data = read_data(train_data_path)

    # Tokenize each document with the project tokenizer
    tk = Token("Tokenizer", max_len)
    train_data["Token"] = train_data[document].apply(tk.make_token_ori)

    # Using Keras Tokenizer
    k_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
    k_tokenizer.fit_on_texts(train_data["Token"].values.tolist())
    words_count = len(k_tokenizer.word_counts)
    print("Save Keras tokenizer for validate in %s"%(weight_save_path))
    with open(os.path.join(weight_save_path,"keras_tokenizer.pkl"), "wb") as f:
        pickle.dump(k_tokenizer, f)
    # Load pre-trained Word2Vec embeddings and build the initial embedding matrix.
    # Words missing from the pre-trained vocabulary keep their random initialization.
    w2v_model = word2vec.Word2Vec.load("w2v_pretrain_emb/w2v_20M_500.model")
    init_weight = np.random.uniform(size=(words_count + 1, 500), low=-1, high=1)

    oov_words = []
    for i in range(1, len(k_tokenizer.index_word) + 1):
        word = k_tokenizer.index_word[i]
        try:
            init_weight[i] = w2v_model.wv[word]
        except KeyError:
            oov_words.append([i, word])

    # Convert token lists to padded integer sequences
    sequences = k_tokenizer.texts_to_sequences(train_data['Token'])
    x_train = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
    y_train = train_data[label].values

    # Split off a validation set
    x_train2, x_val, y_train2, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=0)

    # Build the binary-classification AMCNN model
    tf.keras.backend.clear_session()
    amcnn = AMCNN(maxlen=max_len,
                  embed_dim=500,
                  words_count=words_count,
                  filter_size=50,
                  channel=channel,
                  mask_prob=0.5,
                  att_reg=att_reg)
    model = amcnn.build(emb_trainable=False, emb_weight=init_weight)

    model.compile(optimizer=tf.keras.optimizers.Adam(warmup_lr_rate),
                  loss="binary_crossentropy",
                  metrics=["accuracy", k_precision, k_recall, k_f1score])
    checkpoint_path = os.path.join(weight_save_path, "model-{epoch:04d}.h5")

    # Checkpoint callback: save only the best weights by validation loss
    callbacks = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1,
                                save_best_only=True, save_weights_only=True)


    # Reduce the learning rate when validation loss plateaus
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=lr_decay,
                                   cooldown=0,
                                   patience=patience,
                                   min_lr=lr_rate*0.01,
                                   verbose=1)


    # Warm-up stage: train briefly at the reduced learning rate.
    # Validation data is needed because both callbacks monitor val_loss.
    print("===========Warm up %d Epoch Stage===========" % int(epoch * 0.1))
    model.fit(x_train2, y_train2, epochs=int(epoch * 0.1),
              callbacks=[callbacks, lr_reducer], steps_per_epoch=steps_per_epoch,
              batch_size=batch_size, validation_data=(x_val, y_val), verbose=2)

    # Main stage: restore the full learning rate and continue training
    print("============Main %d Epoch Stage=============" % (epoch - int(epoch * 0.1)))
    K.set_value(model.optimizer.learning_rate, lr_rate)
    model.fit(x_train2, y_train2, epochs=epoch - int(epoch * 0.1),
              callbacks=[callbacks, lr_reducer], steps_per_epoch=steps_per_epoch,
              validation_steps=steps_per_epoch, batch_size=batch_size,
              validation_data=(x_val, y_val), verbose=2)
    print("Training complete")
    print("Check the model weight files in %s" % weight_save_path)
Example #2
File: test.py Project: whjzsy/AMCNN
def main():
    # Check for an available GPU and enable memory growth
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # Parse command-line arguments
    test_data_path = args.test_data
    max_len = args.max_length
    att_reg = args.att_reg
    weight_save_path = args.weight_save_path
    document = args.document
    label = args.label
    channel = args.channel
    val_model_epoch = args.val_model_epoch

    # Read Data
    if ".csv" in test_data_path:
        read_data = pd.read_csv
    elif ".xlsx" in test_data_path:
        read_data = pd.read_excel
    else:
        read_data = pd.read_table
    test_data = read_data(test_data_path)

    # Tokenize each document with the project tokenizer
    tk = Token("Tokenizer", max_len)
    test_data["Token"] = test_data[document].apply(tk.make_token_ori)

    # Using Keras Tokenizer
    print("Load Keras tokenizer for validate in %s" % (weight_save_path))
    with open(os.path.join(weight_save_path, "keras_tokenizer.pkl"),
              "rb") as f:
        k_tokenizer = pickle.load(f)
    words_count = len(k_tokenizer.word_counts)

    # Convert token lists to padded integer sequences
    sequences = k_tokenizer.texts_to_sequences(test_data['Token'])
    x_test = keras.preprocessing.sequence.pad_sequences(sequences,
                                                        maxlen=max_len)
    y_test = test_data[label].values

    # Build the binary-classification AMCNN model
    tf.keras.backend.clear_session()
    amcnn = AMCNN(maxlen=max_len,
                  embed_dim=500,
                  words_count=words_count,
                  filter_size=50,
                  channel=channel,
                  mask_prob=0.5,
                  att_reg=att_reg)
    model = amcnn.build(pre_emb=False)
    if val_model_epoch == -1:
        # Default to the most recent checkpoint; os.listdir order is arbitrary,
        # so sort by the zero-padded epoch number in the filename
        model_lst = sorted(i for i in os.listdir(weight_save_path) if i.endswith(".h5"))
        model_weight_path = model_lst[-1]
    else:
        model_weight_path = "model-%04d.h5" % val_model_epoch
    model.load_weights(os.path.join(weight_save_path, model_weight_path))
    print("Evaluate %s Test data" %
          (os.path.join(weight_save_path, model_weight_path)))
    pred_test = model.predict(x_test, verbose=1)
    pred_test2 = np.int32(pred_test >= 0.5).reshape(-1)
    print("==============Evaluate Result============")
    print("f1_score :", f1_score(y_test, pred_test2))
    print("acc_score :", accuracy_score(y_test, pred_test2))
    print("recall_score :", recall_score(y_test, pred_test2))
    print("precision_score :", precision_score(y_test, pred_test2))