def train_model(train_x, train_y, embedding_meta_data, filename):
    """Build a ``SiameseBiLSTM`` from ``siamese_config`` and train it.

    Args:
        train_x: Forwarded unchanged as the first argument of
            ``SiameseBiLSTM.train_model``.
        train_y: Forwarded unchanged as the second argument.
        embedding_meta_data: Forwarded unchanged as the third argument.
        filename: Forwarded unchanged as the fourth argument.

    Returns:
        Whatever ``SiameseBiLSTM.train_model`` returns; the calling code
        treats it as the path of the best saved model.
    """
    # Keys are read in the same order the settings were originally copied.
    setting_keys = (
        'EMBEDDING_DIM',
        'MAX_SEQUENCE_LENGTH',
        'NUMBER_LSTM',
        'RATE_DROP_LSTM',
        'NUMBER_DENSE_UNITS',
        'ACTIVATION_FUNCTION',
        'RATE_DROP_DENSE',
        'VALIDATION_SPLIT',
    )
    settings = {key: siamese_config[key] for key in setting_keys}

    # Positional order expected by the SiameseBiLSTM constructor call.
    network = SiameseBiLSTM(
        settings['EMBEDDING_DIM'],
        settings['MAX_SEQUENCE_LENGTH'],
        settings['NUMBER_LSTM'],
        settings['NUMBER_DENSE_UNITS'],
        settings['RATE_DROP_LSTM'],
        settings['RATE_DROP_DENSE'],
        settings['ACTIVATION_FUNCTION'],
        settings['VALIDATION_SPLIT'],
    )

    return network.train_model(
        train_x,
        train_y,
        embedding_meta_data,
        filename,
        model_save_directory='../data/model/siamese-lstm/',
    )
def testing1(best_model_path):
    """Update a saved siamese model and rank the test pairs by score.

    Training data comes from ``make_new_train_data`` and test pairs from
    ``make_test_data``. A tokenizer and embedding matrix are built from the
    training sentences, the model at ``best_model_path`` is handed to
    ``SiameseBiLSTM.update_model``, and the model at the path returned by
    that call is loaded to score the test pairs.

    Args:
        best_model_path: Passed to ``SiameseBiLSTM.update_model`` as the
            model to update.

    Returns:
        A ``(results, preds)`` tuple. ``preds`` is the flattened list of
        values returned by ``model.predict``; ``results`` is a list of
        ``(first, second, score)`` tuples sorted by score, highest first.
    """
    left_sentences, right_sentences, labels, training_pairs = (
        make_new_train_data())
    evaluation_pairs = make_test_data()

    tokenizer, embedding_matrix = word_embed_meta_data(
        left_sentences + right_sentences, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix,
    }

    # Read every hyper-parameter up front, in the original lookup order.
    embedding_dim = siamese_config['EMBEDDING_DIM']
    max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
    number_lstm_units = siamese_config['NUMBER_LSTM']
    rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
    number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
    activation_function = siamese_config['ACTIVATION_FUNCTION']
    rate_drop_dense = siamese_config['RATE_DROP_DENSE']
    validation_split_ratio = siamese_config['VALIDATION_SPLIT']

    network = SiameseBiLSTM(
        embedding_dim,
        max_sequence_length,
        number_lstm_units,
        number_dense_units,
        rate_drop_lstm,
        rate_drop_dense,
        activation_function,
        validation_split_ratio,
    )
    best_model_path = network.update_model(
        best_model_path, training_pairs, labels, embedding_meta_data)

    # Imports stay local and are only reached after the update has run.
    from operator import itemgetter
    from keras.models import load_model

    model = load_model(best_model_path)
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, evaluation_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])

    raw_scores = model.predict(
        [test_data_x1, test_data_x2, leaks_test], verbose=1)
    preds = list(raw_scores.ravel())

    # Attach each score to its pair, then order by score, highest first.
    results = sorted(
        [(first, second, score)
         for (first, second), score in zip(evaluation_pairs, preds)],
        key=itemgetter(2),
        reverse=True,
    )
    return results, preds
def testing1(best_model_path):
    """Update a pretrained siamese model and score the test pairs with it.

    Args:
        best_model_path: Passed to ``SiameseBiLSTM.update_model`` as the
            model to update.

    Returns:
        A ``(results, preds)`` tuple. ``preds`` is the flattened list of
        values returned by ``model.predict``; ``results`` holds one
        ``(first, second, score)`` tuple per test pair, sorted by score
        in descending order.
    """
    # Training data and the pairs to be scored.
    text1, text2, class1, train_pair = make_new_train_data()
    test_pair = make_test_data()

    # Tokenizer and embedding matrix built from all training sentences.
    tokenizer, embedding_matrix = word_embed_meta_data(
        text1 + text2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix,
    }

    # Model configuration, read in the original lookup order.
    config_keys = (
        'EMBEDDING_DIM',
        'MAX_SEQUENCE_LENGTH',
        'NUMBER_LSTM',
        'RATE_DROP_LSTM',
        'NUMBER_DENSE_UNITS',
        'ACTIVATION_FUNCTION',
        'RATE_DROP_DENSE',
        'VALIDATION_SPLIT',
    )
    cfg = {key: siamese_config[key] for key in config_keys}

    # Siamese network wrapper.
    siamese = SiameseBiLSTM(
        cfg['EMBEDDING_DIM'],
        cfg['MAX_SEQUENCE_LENGTH'],
        cfg['NUMBER_LSTM'],
        cfg['NUMBER_DENSE_UNITS'],
        cfg['RATE_DROP_LSTM'],
        cfg['RATE_DROP_DENSE'],
        cfg['ACTIVATION_FUNCTION'],
        cfg['VALIDATION_SPLIT'],
    )

    # Update the pretrained model, then load the model at the returned path.
    updated_model_path = siamese.update_model(
        best_model_path, train_pair, class1, embedding_meta_data)
    model = load_model(updated_model_path)

    # Turn the test pairs into the three inputs passed to the model.
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, test_pair, siamese_config['MAX_SEQUENCE_LENGTH'])

    # Flattened predictions for the test data.
    model_output = model.predict(
        [test_data_x1, test_data_x2, leaks_test], verbose=1)
    preds = list(model_output.ravel())

    # Results are stored in the following form:
    #   known1, unknown, result1
    #   known2, unknown, result2
    #   known3, unknown, result3
    scored_pairs = [
        (first, second, score)
        for (first, second), score in zip(test_pair, preds)
    ]
    results = sorted(scored_pairs, key=itemgetter(2), reverse=True)
    return results, preds
def Train(languege='Persian'):
    """Train a ``SiameseBiLSTM`` on the stored data of one language.

    All inputs are loaded with ``load_data`` from paths under
    ``pre_path_files + languege``; the model save directory passed to
    ``SiameseBiLSTM.train_model`` is the ``Model/`` folder under the same
    prefix. Hyper-parameters are taken from the ``CONFIG`` object that is
    expected to exist in an outer scope.

    Args:
        languege: Name of the language folder appended to
            ``pre_path_files``. Defaults to ``'Persian'``.

    Returns:
        None. The value returned by ``SiameseBiLSTM.train_model`` is not
        propagated.
    """
    language_root = pre_path_files + languege

    question_pre = load_data(language_root + '/question_pre')
    answering_pre = load_data(language_root + '/answering_pre')
    is_similar = load_data(language_root + '/is_similar')
    tokenizer = load_data(language_root + '/Word2Vec/tokenizer')
    embedding_matrix = load_data(language_root + '/Word2Vec/embedding_matrix')

    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix,
    }

    # One (question, answer) tuple per aligned pair of entries.
    sentences_pair = list(zip(question_pre, answering_pre))

    network = SiameseBiLSTM(
        CONFIG.embedding_dim,
        CONFIG.max_sequence_length,
        CONFIG.number_lstm_units,
        CONFIG.number_dense_units,
        CONFIG.rate_drop_lstm,
        CONFIG.rate_drop_dense,
        CONFIG.activation_function,
        CONFIG.validation_split_ratio,
    )
    network.train_model(
        sentences_pair,
        is_similar,
        embedding_meta_data,
        model_save_directory=language_root + '/' + 'Model/',
    )
def train_model(sentences1, sentences2, is_similar):
    """Train a ``SiameseBiLSTM`` on aligned sentence pairs.

    ``embedding_meta_data`` is not a parameter: it is read from an outer
    scope and must exist there when this function is called.

    Args:
        sentences1: First element of each pair.
        sentences2: Second element of each pair, aligned with
            ``sentences1`` (pairing stops at the shorter of the two).
        is_similar: Forwarded unchanged to ``SiameseBiLSTM.train_model``
            as the second argument.

    Returns:
        Whatever ``SiameseBiLSTM.train_model`` returns; the calling code
        treats it as the path of the best saved model.
    """
    sentences_pair = list(zip(sentences1, sentences2))

    # Values noted next to each setting by the original author are kept as
    # hints only; the real values come from siamese_config.
    embedding_dim = siamese_config['EMBEDDING_DIM']              # 100d
    max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']  # 20
    number_lstm_units = siamese_config['NUMBER_LSTM']            # 100
    rate_drop_lstm = siamese_config['RATE_DROP_LSTM']            # 0.2
    number_dense_units = siamese_config['NUMBER_DENSE_UNITS']    # 100
    activation_function = siamese_config['ACTIVATION_FUNCTION']  # relu
    rate_drop_dense = siamese_config['RATE_DROP_DENSE']          # 0.2
    validation_split_ratio = siamese_config['VALIDATION_SPLIT']  # 0.15

    network = SiameseBiLSTM(
        embedding_dim,
        max_sequence_length,
        number_lstm_units,
        number_dense_units,
        rate_drop_lstm,
        rate_drop_dense,
        activation_function,
        validation_split_ratio,
    )
    return network.train_model(
        sentences_pair,
        is_similar,
        embedding_meta_data,
        model_save_directory='./',
    )
def Update_Train(languege, question, answer):
    """Retrain the siamese model from the stored data of one language.

    The two supported languages share one code path; only the data folder
    differs. Hyper-parameters are taken from the ``CONFIG`` object that is
    expected to exist in an outer scope.

    Args:
        languege: Language selector. ``0`` selects the ``Persian/`` folder
            and ``1`` the ``English/`` folder under ``pre_path_files``.
            Any other value makes the call a no-op.
        question: Not used by this function.
        answer: Not used by this function.

    Returns:
        The value returned by ``SiameseBiLSTM.train_model`` for a
        supported language, or ``None`` when ``languege`` is neither
        ``0`` nor ``1``.
    """
    # Plain equality tests (rather than a dict lookup) keep the behaviour
    # unchanged for unhashable or otherwise unusual selector values.
    if languege == 0:
        folder = 'Persian/'
    elif languege == 1:
        folder = 'English/'
    else:
        return None

    data_dir = pre_path_files + folder

    question_pre = load_data(data_dir + 'question_pre')
    answering_pre = load_data(data_dir + 'answering_pre')
    is_similar = load_data(data_dir + 'is_similar')
    tokenizer = load_data(data_dir + 'tokenizer')
    embedding_matrix = load_data(data_dir + 'embedding_matrix')

    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix,
    }
    # One (question, answer) tuple per aligned pair of entries.
    sentences_pair = list(zip(question_pre, answering_pre))

    siamese = SiameseBiLSTM(
        CONFIG.embedding_dim,
        CONFIG.max_sequence_length,
        CONFIG.number_lstm_units,
        CONFIG.number_dense_units,
        CONFIG.rate_drop_lstm,
        CONFIG.rate_drop_dense,
        CONFIG.activation_function,
        CONFIG.validation_split_ratio,
    )
    # Hand the result back to the caller instead of discarding it.
    return siamese.train_model(
        sentences_pair,
        is_similar,
        embedding_meta_data,
        model_save_directory=data_dir,
    )
# Module-level setup: copy the hyper-parameters from siamese_config onto a
# CONFIG object, build the network and train it. The names defined here
# (Configuration, CONFIG, siamese, best_model_path) live at module scope.
class Configuration(object):
    """Empty holder class; settings are attached to an instance as attributes."""

CONFIG = Configuration()
CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']

# Note the argument order: dense units come before the two dropout rates.
siamese = SiameseBiLSTM(CONFIG.embedding_dim , CONFIG.max_sequence_length,
                        CONFIG.number_lstm_units , CONFIG.number_dense_units,
                        CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                        CONFIG.activation_function, CONFIG.validation_split_ratio)

# sentences_pair, is_similar and embedding_meta_data are not defined in this
# excerpt; they must already exist at module scope when this line runs.
best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data, model_save_directory='./')

########################
###### Testing #########
########################

def test(test_pair):
    # NOTE(review): this function appears to continue beyond the visible
    # excerpt (nothing is predicted or returned here, and bias1/bias2 are
    # not used yet) -- confirm against the full file before relying on it.
    # Loads the model from the module-level best_model_path set above.
    model = load_model(best_model_path)
    test_sentence_pairs = test_pair
    # Two random floats, each drawn uniformly from a fixed interval.
    bias1=random.uniform(0.28123,0.3156)
    bias2=random.uniform(0.07682,0.09354)
    # tokenizer is read from an outer scope; it is not defined in this excerpt.
    test_data_x1, test_data_x2, leaks_test = create_test_data(tokenizer,test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])