def PBLM_CNN(src, dest, pivot_num, max_review_len, embedding_vecor_length_rep,
             topWords, hidden_units_num_rep, filters, kernel_size, iter_num,
             criteria):
    # path of the PBLM model saved by the final training iteration
    model_path = (src + "_to_" + dest + "/models/" + criteria + "-" +
                  str(iter_num) + "/" + src + "_" + dest + "_" +
                  str(pivot_num) + "_" + str(hidden_units_num_rep) + "_" +
                  str(embedding_vecor_length_rep) + "_" + ".model" + "." +
                  str(iter_num - 1))
    model = load_model(model_path)
    split_dir = src + "_to_" + dest

    # gets all the train and test data for sentiment classification
    with open(split_dir + "/split/train", 'rb') as f:
        train = pickle.load(f)
    with open(split_dir + "/split/test", 'rb') as f:
        val = pickle.load(f)
    unlabeled, source, target = pre.XML2arrayRAW(
        "data/" + src + "/" + src + "UN.txt",
        "data/" + dest + "/" + dest + "UN.txt")
    dest_test, source, target = XML2arrayRAW(
        "data/" + dest + "/negative.parsed",
        "data/" + dest + "/positive.parsed")
    unlabeled = getClearList(unlabeled)
    train = getClearList(train)

    # the tokenizer is fit on the labeled source train data plus the unlabeled data
    tok = Tokenizer(num_words=topWords, split=" ")
    tok.fit_on_texts(train + unlabeled)

    train_count = 800
    X_train = tok.texts_to_sequences(train)
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_len)
    Y_train = [0] * train_count + [1] * train_count

    val = getClearList(val)
    X_val = tok.texts_to_sequences(val)
    X_val = sequence.pad_sequences(X_val, maxlen=max_review_len)
    val_count = 200
    Y_val = [0] * val_count + [1] * val_count

    dest_test = getClearList(dest_test)
    X_test = tok.texts_to_sequences(dest_test)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_len)
    test_count = 1000
    Y_test = [0] * test_count + [1] * test_count

    # loading the PBLM model without the softmax layer
    modelT = Sequential()
    for i in range(len(model.layers) - 1):
        modelT.add(model.layers[i])
        modelT.layers[i].trainable = False
        modelT.layers[i].mask_zero = False
    modelT.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                   metrics=['accuracy'])
    print(modelT.summary())

    # getting the input vectors, for more information read the "partial" function comments
    X_test = partial(modelT, X_test)
    X_train = partial(modelT, X_train)
    X_val = partial(modelT, X_val)
    print("train shape ", X_train.shape)
    print("val shape ", X_val.shape)
    print("test shape ", X_test.shape)
    train_data = X_train
    val_data = X_val
    test_data = X_test

    # sentiment classifier: 1D convolution over the PBLM hidden representations
    sent_model = Sequential()
    sent_model.add(Conv1D(filters, kernel_size, padding='valid',
                          activation='relu',
                          input_shape=(max_review_len, hidden_units_num_rep)))
    # we use max pooling:
    sent_model.add(GlobalMaxPooling1D())
    sent_model.add(Dense(1, activation='sigmoid'))
    sent_model.compile(loss='binary_crossentropy', optimizer='adam',
                       metrics=['accuracy'])
    print(sent_model.layers)
    print(sent_model.summary())

    model_str = (src + "_to_" + dest + "/sent_models_cnn/" + criteria + "-" +
                 str(iter_num) + "/model_" + str(pivot_num) + "_" +
                 str(hidden_units_num_rep) + "_.model")
    filename = model_str
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    # saving only the best model with respect to the validation loss
    modelCheckpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=0,
                                      save_best_only=True,
                                      save_weights_only=False, mode='min',
                                      period=1)
    # stops as soon as the validation loss stops decreasing
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
    sent_model.fit(train_data, Y_train, validation_data=(val_data, Y_val),
                   epochs=10, batch_size=16,
                   callbacks=[earlyStopping, modelCheckpoint])
    print(sent_model.summary())
    print(sent_model.get_config())

    # reload the best checkpoint and evaluate on the validation and target test sets
    sent_model = load_model(filename)
    val_score, val_acc = sent_model.evaluate(val_data, Y_val, batch_size=16)
    print('val loss:', val_score)
    print('val accuracy:', val_acc)
    test_score, test_acc = sent_model.evaluate(test_data, Y_test, batch_size=16)
    print('Test loss:', test_score)
    print('Test accuracy:', test_acc)

    # append the validation and test accuracy to the results file
    score_path = (src + "_to_" + dest + "/results/" + criteria + "-" +
                  str(iter_num) + "/cnn/results.txt")
    sentence = ("pivots = " + str(pivot_num) + " HU rep " +
                str(hidden_units_num_rep) + " word rep size " +
                str(embedding_vecor_length_rep) + " the val acc " +
                str(val_acc) + " test acc " + str(test_acc))
    if not os.path.exists(os.path.dirname(score_path)):
        os.makedirs(os.path.dirname(score_path))
    with open(score_path, "a") as myfile:
        myfile.write(sentence + "\n")
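
# Note on partial(): the helper is defined elsewhere in this repository, so it
# is not shown here. Judging from the Conv1D input_shape above, it presumably
# runs the padded sequences through the truncated PBLM (modelT) and returns one
# hidden vector per timestep. A minimal sketch of that assumption (illustrative
# only, not the repository's actual implementation):
#
#     def partial(modelT, X, batch_size=32):
#         # -> array of shape (num_examples, max_review_len, hidden_units_num_rep)
#         return modelT.predict(X, batch_size=batch_size)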
def train_PBLM(src, dest, pivot_num, pivot_min_st, word_vector_size, topWords,
               max_review_len, hidden_units_num, names, current_iter, iter_num,
               criteria):
    split_dir = src + "_to_" + dest

    # gets all the train data for sentiment classification
    with open(split_dir + "/split/train", 'rb') as f:
        train = pickle.load(f)
    unlabeled, source, target = pre.XML2arrayRAW(
        "data/" + src + "/" + src + "UN.txt",
        "data/" + dest + "/" + dest + "UN.txt")
    unlabeled = getClearList(unlabeled)
    train = getClearList(train)

    # hold out a fifth of the source and target unlabeled data for validation
    source_valid = len(source) // 5
    target_valid = len(target) // 5
    tok = Tokenizer(num_words=topWords, split=" ")
    tok.fit_on_texts(train + unlabeled)
    x_valid = unlabeled[:source_valid] + unlabeled[-target_valid:]
    x = unlabeled[source_valid:-target_valid] + train

    # you can reload the pivots if you want to avoid the pivot extraction
    '''
    filename = src + "_to_" + dest + "/pivots/" + str(pivot_num)
    with open(filename, 'rb') as f:
        names = pickle.load(f)
    '''
    filename = src + "_to_" + dest + "/pivots/" + str(pivot_num) + "_meta"
    with open(filename, 'rb') as f:
        meta = pickle.load(f)

    # number of pivots revealed at each incremental iteration
    revel_num = pivot_num // (iter_num - 1)
    print("revel ", revel_num * current_iter)
    print("the names ", names)
    print("the hiddens ", names[revel_num * current_iter:])
    Pdict, class_count = fill_pivot_dict(names,
                                         names[revel_num * current_iter:],
                                         meta)
    X_train = tok.texts_to_sequences(x)
    X_test = tok.texts_to_sequences(x_valid)

    # creates the model
    embedding_vecor_length = word_vector_size
    model = Sequential()
    model.add(Embedding(topWords, embedding_vecor_length, mask_zero=True,
                        embeddings_initializer="glorot_uniform",
                        input_length=max_review_len))
    model.add(LSTM(hidden_units_num, return_sequences=True))
    model_str = (src + "_to_" + dest + "/models/" + criteria + "-" +
                 str(iter_num) + "/" + src + "_" + dest + "_" +
                 str(pivot_num) + "_" + str(hidden_units_num) + "_" +
                 str(word_vector_size) + "_" + ".model")
    num_class = class_count + 2
    model.add(TimeDistributed(Dense(num_class, activation='softmax')))

    if current_iter != 0:
        # warm-start from the model saved by the previous iteration
        old_model_str = model_str + "." + str(current_iter - 1)
        model = GetNewModel(old_model_str, model, names, meta)
    new_model_str = model_str + "." + str(current_iter)

    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'], sample_weight_mode="temporal")
    print(model.summary())
    if not os.path.exists(os.path.dirname(new_model_str)):
        os.makedirs(os.path.dirname(new_model_str))

    # saves only the best model with respect to the validation loss
    modelCheckpoint = ModelCheckpoint(new_model_str, monitor='val_loss',
                                      verbose=0, save_best_only=True,
                                      save_weights_only=False, mode='min',
                                      period=1)
    # stops the training if the validation loss has not decreased during the last 2 epochs
    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
    model.fit_generator(generator(X_train, 16, names, Pdict, tok,
                                  max_review_len, class_count),
                        steps_per_epoch=(len(X_train) // 16), epochs=10,
                        validation_data=generator_val(X_test, 16, names, Pdict,
                                                      tok, max_review_len,
                                                      class_count),
                        validation_steps=(len(X_test) // 16),
                        callbacks=[earlyStopping, modelCheckpoint])
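
# A minimal usage sketch (not part of the original script): the domain pair,
# hyper-parameters, criteria string, and pivot list below are illustrative
# assumptions, and the real driver that sets them lives elsewhere in this
# repository. The PBLM representation model is trained incrementally with
# train_PBLM, and the model saved by the final iteration is then consumed by
# PBLM_CNN.
if __name__ == "__main__":
    src, dest = "books", "dvd"      # hypothetical source/target domains
    pivot_num, iter_num = 100, 5    # hypothetical pivot count / iteration count
    names = []                      # pivot features; normally produced by the pivot-extraction step
    criteria = "mi"                 # hypothetical pivot-selection criterion name
    for current_iter in range(iter_num):
        train_PBLM(src, dest, pivot_num, pivot_min_st=10, word_vector_size=128,
                   topWords=10000, max_review_len=500, hidden_units_num=256,
                   names=names, current_iter=current_iter, iter_num=iter_num,
                   criteria=criteria)
    PBLM_CNN(src, dest, pivot_num, max_review_len=500,
             embedding_vecor_length_rep=128, topWords=10000,
             hidden_units_num_rep=256, filters=250, kernel_size=3,
             iter_num=iter_num, criteria=criteria)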