Example #1
# NOTE: imports assumed from standard Keras 2.x; pre, getClearList and
# partial are helper modules/functions defined elsewhere in the project.
import os
import pickle

from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, EarlyStopping


def PBLM_CNN(src, dest, pivot_num, max_review_len, embedding_vecor_length_rep,
             topWords, hidden_units_num_rep, filters, kernel_size, iter_num,
             criteria):
    model_path = src + "_to_" + dest + "/models/" + criteria + "-" + str(
        iter_num) + "/" + src + "_" + dest + "_" + str(pivot_num) + "_" + str(
            hidden_units_num_rep) + "_" + str(
                embedding_vecor_length_rep) + "_" + ".model" + "." + str(
                    iter_num - 1)
    model = load_model(model_path)
    split_dir = src + "_to_" + dest
    # loads the train and test splits for sentiment classification
    with open(split_dir + "/split/train", 'rb') as f:
        train = pickle.load(f)
    with open(split_dir + "/split/test", 'rb') as f:
        val = pickle.load(f)

    unlabeled, source, target = pre.XML2arrayRAW(
        "data/" + src + "/" + src + "UN.txt",
        "data/" + dest + "/" + dest + "UN.txt")

    dest_test, source, target = pre.XML2arrayRAW(
        "data/" + dest + "/negative.parsed",
        "data/" + dest + "/positive.parsed")
    unlabeled = getClearList(unlabeled)
    train = getClearList(train)
    tok = Tokenizer(num_words=topWords, split=" ")
    tok.fit_on_texts(train + unlabeled)
    train_count = 800
    X_train = tok.texts_to_sequences(train)
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_len)
    Y_train = [0] * train_count + [1] * train_count
    val = getClearList(val)
    X_val = tok.texts_to_sequences(val)
    X_val = sequence.pad_sequences(X_val, maxlen=max_review_len)
    val_count = 200
    Y_val = [0] * val_count + [1] * val_count
    dest_test = getClearList(dest_test)
    X_test = tok.texts_to_sequences(dest_test)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_len)
    test_count = 1000
    Y_test = [0] * test_count + [1] * test_count
    # loading the PBLM model layers without the final softmax layer and freezing them
    modelT = Sequential()
    for i in range(len(model.layers) - 1):
        modelT.add(model.layers[i])
        modelT.layers[i].trainable = False
        modelT.layers[i].mask_zero = False
    modelT.compile(optimizer='rmsprop',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
    print(modelT.summary())

    # getting the input vectors; for more information read the "partial" function comments
    X_test = partial(modelT, X_test)
    X_train = partial(modelT, X_train)
    X_val = partial(modelT, X_val)
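    # (Assumption for readability: the project's "partial" helper is taken to
    #  feed the padded sequences through the truncated PBLM model and return
    #  per-timestep hidden representations of shape
    #  (n_samples, max_review_len, hidden_units_num_rep), which matches the
    #  input_shape of the CNN defined below.)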

    print "train shape ", X_train.shape
    print "val shape ", X_val.shape
    print "test shape ", X_test.shape

    train_data = X_train
    val_data = X_val
    test_data = X_test

    sent_model = Sequential()

    sent_model.add(
        Conv1D(filters,
               kernel_size,
               padding='valid',
               activation='relu',
               input_shape=(max_review_len, hidden_units_num_rep)))
    # we use max pooling:
    sent_model.add(GlobalMaxPooling1D())
    sent_model.add(Dense(1, activation='sigmoid'))
    sent_model.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
    print(sent_model.layers)
    print(sent_model.summary())

    model_str = src + "_to_" + dest + "/sent_models_cnn/" + criteria + "-" + str(
        iter_num) + "/model_" + str(pivot_num) + "_" + str(
            hidden_units_num_rep) + "_.model"
    filename = model_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    # saves only the best model with respect to the validation loss
    modelCheckpoint = ModelCheckpoint(filename,
                                      monitor='val_loss',
                                      verbose=0,
                                      save_best_only=True,
                                      save_weights_only=False,
                                      mode='min',
                                      period=1)
    # stops training as soon as the validation loss stops decreasing
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
    sent_model.fit(train_data,
                   Y_train,
                   validation_data=(val_data, Y_val),
                   epochs=10,
                   batch_size=16,
                   callbacks=[earlyStopping, modelCheckpoint])
    print(sent_model.summary())
    print(sent_model.get_config())
    sent_model = load_model(filename)
    val_score, val_acc = sent_model.evaluate(val_data, Y_val, batch_size=16)
    print('val loss:', val_score)
    print('val accuracy:', val_acc)
    test_score, test_acc = sent_model.evaluate(test_data,
                                               Y_test,
                                               batch_size=16)
    print('Test loss:', test_score)
    print('Test accuracy:', test_acc)

    score_path = src + "_to_" + dest + "/results/" + criteria + "-" + str(
        iter_num) + "/cnn/results.txt"
    sentence = "pivots = " + str(pivot_num) + " HU rep " + str(
        hidden_units_num_rep) + " word rep size " + str(
            embedding_vecor_length_rep) + " the val acc " + str(
                val_acc) + " test acc " + str(test_acc)

    if not os.path.exists(os.path.dirname(score_path)):
        os.makedirs(os.path.dirname(score_path))

    with open(score_path, "a") as myfile:
        myfile.write(sentence + "\n")
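For reference, a call to PBLM_CNN could look like the sketch below. The domain names, pivot count, and hyperparameter values are illustrative assumptions only; they follow the parameter order of the function above and are not taken from the project configuration.

# Hypothetical invocation of PBLM_CNN; all argument values are assumptions.
PBLM_CNN(src="books", dest="dvd", pivot_num=100, max_review_len=500,
         embedding_vecor_length_rep=128, topWords=10000,
         hidden_units_num_rep=256, filters=250, kernel_size=3,
         iter_num=5, criteria="mi")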
Example #2
File: tr.py  Project: GAIMJKP/TRL-PBLM
# NOTE: imports assumed from standard Keras 2.x; pre, getClearList,
# fill_pivot_dict, generator, generator_val and GetNewModel are helpers
# defined elsewhere in the project.
import os
import pickle

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, EarlyStopping


def train_PBLM(src, dest, pivot_num, pivot_min_st, word_vector_size, topWords,
               max_review_len, hidden_units_num, names, current_iter, iter_num,
               criteria):

    split_dir = src + "_to_" + dest
    # loads the train split for sentiment classification
    with open(split_dir + "/split/train", 'rb') as f:
        train = pickle.load(f)

    unlabeled, source, target = pre.XML2arrayRAW(
        "data/" + src + "/" + src + "UN.txt",
        "data/" + dest + "/" + dest + "UN.txt")
    unlabeled = getClearList(unlabeled)
    train = getClearList(train)
    source_valid = len(source) // 5
    target_valid = len(target) // 5
    tok = Tokenizer(num_words=topWords, split=" ")
    tok.fit_on_texts(train + unlabeled)
    x_valid = unlabeled[:source_valid] + unlabeled[-target_valid:]
    x = unlabeled[source_valid:-target_valid] + train

    #you can reload the pivots if you want to avoid the pivot extraction
    '''
    filename =src + "_to_" + dest + "/pivots/"+str(pivot_num)

    with open(filename, 'rb') as f:
        names = pickle.load(f)
    '''
    filename = src + "_to_" + dest + "/pivots/" + str(pivot_num) + "_meta"
    with open(filename, 'rb') as f:
        meta = pickle.load(f)
    revel_num = pivot_num // (iter_num - 1)
    print("revel ", revel_num * (current_iter))
    print("the names   ", names)
    print("the hiddens ", names[revel_num * (current_iter):])

    Pdict, class_count = fill_pivot_dict(names,
                                         names[revel_num * (current_iter):],
                                         meta)
    X_train = tok.texts_to_sequences(x)
    X_test = tok.texts_to_sequences(x_valid)

    #creates the model
    embedding_vecor_length = word_vector_size
    model = Sequential()
    model.add(
        Embedding(topWords,
                  embedding_vecor_length,
                  mask_zero=True,
                  embeddings_initializer="glorot_uniform",
                  input_length=max_review_len))
    model.add(LSTM(hidden_units_num, return_sequences=True))
    model_str = src + "_to_" + dest + "/models/" + criteria + "-" + str(
        iter_num) + "/" + src + "_" + dest + "_" + str(pivot_num) + "_" + str(
            hidden_units_num) + "_" + str(word_vector_size) + "_" + ".model"
    num_class = class_count + 2
    model.add(TimeDistributed(Dense(num_class, activation='softmax')))

    if (current_iter != 0):
        old_model_str = model_str + "." + str(current_iter - 1)
        model = GetNewModel(old_model_str, model, names, meta)

    new_model_str = model_str + "." + str(current_iter)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'],
                  sample_weight_mode="temporal")
    print(model.summary())
    if not os.path.exists(os.path.dirname(new_model_str)):
        os.makedirs(os.path.dirname(new_model_str))
    # saves only the best model with respect to the validation loss
    modelCheckpoint = ModelCheckpoint(new_model_str,
                                      monitor='val_loss',
                                      verbose=0,
                                      save_best_only=True,
                                      save_weights_only=False,
                                      mode='min',
                                      period=1)
    #stops the training if the validation loss has not decreased during the last 2 epochs
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
    model.fit_generator(generator(X_train, 16, names, Pdict, tok,
                                  max_review_len, class_count),
                        steps_per_epoch=(len(X_train) // 16),
                        epochs=10,
                        validation_data=generator_val(X_test, 16, names, Pdict,
                                                      tok, max_review_len,
                                                      class_count),
                        validation_steps=(len(X_test) // 16),
                        callbacks=[earlyStopping, modelCheckpoint])
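Similarly, a call to train_PBLM could look like the following sketch. Here pivot_names stands for a list of pivot_num pivot features produced by the project's pivot-extraction step, and every value shown is an illustrative assumption rather than a setting from the repository.

# Hypothetical invocation of train_PBLM; pivot_names (a list of pivot_num
# pivot features from the pivot-extraction step) and all values shown are
# illustrative assumptions.
train_PBLM(src="books", dest="dvd", pivot_num=100, pivot_min_st=10,
           word_vector_size=128, topWords=10000, max_review_len=500,
           hidden_units_num=256, names=pivot_names, current_iter=0,
           iter_num=5, criteria="mi")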