def train_model(train_x, train_y, embedding_meta_data, filename):
    class Configuration(object):
        """Holds model hyperparameters read from siamese_config."""

    CONFIG = Configuration()

    CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
    CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
    CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
    CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
    CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
    CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
    CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
    CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']

    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length,
                            CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm,
                            CONFIG.rate_drop_dense, CONFIG.activation_function,
                            CONFIG.validation_split_ratio)

    best_model_path = siamese.train_model(
        train_x,
        train_y,
        embedding_meta_data,
        filename,
        model_save_directory='../data/model/siamese-lstm/')
    return best_model_path
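All of these snippets read their hyperparameters from a shared siamese_config dict defined elsewhere in each repo. A minimal sketch of its likely shape, using the values recorded in the inline comments of the train_model variant further below:

siamese_config = {
    'EMBEDDING_DIM': 100,           # dimensionality of the word vectors
    'MAX_SEQUENCE_LENGTH': 20,      # pad/truncate sentences to this length
    'NUMBER_LSTM': 100,             # units per LSTM direction
    'RATE_DROP_LSTM': 0.2,          # dropout applied inside the LSTM layers
    'NUMBER_DENSE_UNITS': 100,      # units in the dense merge layer
    'ACTIVATION_FUNCTION': 'relu',  # activation of the dense layer
    'RATE_DROP_DENSE': 0.2,         # dropout after the dense layer
    'VALIDATION_SPLIT': 0.15,       # fraction of pairs held out for validation
}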
def testing1(best_model_path):
    sentences1, sentences2, class1, train_pair = make_new_train_data()

    test_pair = make_test_data()

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])

    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    class Configuration(object):
        """Holds model hyperparameters read from siamese_config."""

    CONFIG = Configuration()

    CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
    CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
    CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
    CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
    CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
    CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
    CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
    CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']
    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length,
                            CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm,
                            CONFIG.rate_drop_dense, CONFIG.activation_function,
                            CONFIG.validation_split_ratio)

    best_model_path = siamese.update_model(best_model_path, train_pair, class1,
                                           embedding_meta_data)

    from operator import itemgetter
    from keras.models import load_model

    model = load_model(best_model_path)

    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, test_pair, siamese_config['MAX_SEQUENCE_LENGTH'])

    preds = list(
        model.predict([test_data_x1, test_data_x2, leaks_test],
                      verbose=1).ravel())
    results = [(x, y, z) for (x, y), z in zip(test_pair, preds)]
    results.sort(key=itemgetter(2), reverse=True)
    return results, preds
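testing1 also depends on a word_embed_meta_data helper that is not shown here. A hedged sketch of what it plausibly does, with random vectors standing in for whatever trained embeddings (word2vec, GloVe) the real helper loads:

import numpy as np
from keras.preprocessing.text import Tokenizer

def word_embed_meta_data(documents, embedding_dim):
    # Fit a word-to-index tokenizer over every training sentence.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    vocab_size = len(tokenizer.word_index) + 1
    # Placeholder weights; the real helper would fill these from trained vectors.
    embedding_matrix = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))
    embedding_matrix[0] = 0.0  # row 0 is Keras' padding index
    return tokenizer, embedding_matrix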
Example #3
def testing1(best_model_path):
    from operator import itemgetter
    from keras.models import load_model

    # Build the training pairs and labels.
    text1, text2, class1, train_pair = make_new_train_data()
    # Build the test pairs.
    test_pair = make_test_data()
    # Build the tokenizer and embedding matrix over the training text.
    tokenizer, embedding_matrix = word_embed_meta_data(
        text1 + text2, siamese_config['EMBEDDING_DIM'])

    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    class Configuration(object):
        """Holds model hyperparameters read from siamese_config."""

    CONFIG = Configuration()
    # Set the model configuration from siamese_config.
    CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
    CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
    CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
    CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
    CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
    CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
    CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
    CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']
    # Build the Siamese network.
    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length,
                            CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm,
                            CONFIG.rate_drop_dense, CONFIG.activation_function,
                            CONFIG.validation_split_ratio)
    # Fine-tune the pretrained model and save the updated weights.
    best_model_path = siamese.update_model(best_model_path, train_pair, class1,
                                           embedding_meta_data)
    # Load the best updated model.
    model = load_model(best_model_path)
    # Create the test inputs in the format the model expects.
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, test_pair, siamese_config['MAX_SEQUENCE_LENGTH'])
    # Store the test-set predictions in preds.
    preds = list(
        model.predict([test_data_x1, test_data_x2, leaks_test],
                      verbose=1).ravel())
    '''
    Store the results in the form:
        (known1, unknown, score1)
        (known2, unknown, score2)
        (known3, unknown, score3)
    '''
    results = [(x, y, z) for (x, y), z in zip(test_pair, preds)]
    results.sort(key=itemgetter(2), reverse=True)
    return results, preds
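create_test_data is likewise external to these snippets. A plausible sketch, assuming it tokenizes and pads both sides of each pair and derives simple 'leaks' overlap features as the model's third input:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

def create_test_data(tokenizer, test_sentence_pairs, max_sequence_length):
    seq1 = tokenizer.texts_to_sequences([p[0] for p in test_sentence_pairs])
    seq2 = tokenizer.texts_to_sequences([p[1] for p in test_sentence_pairs])
    # Hand-crafted overlap features fed to the model alongside the text inputs.
    leaks = [[len(set(a)), len(set(b)), len(set(a).intersection(b))]
             for a, b in zip(seq1, seq2)]
    test_data_x1 = pad_sequences(seq1, maxlen=max_sequence_length)
    test_data_x2 = pad_sequences(seq2, maxlen=max_sequence_length)
    return test_data_x1, test_data_x2, np.array(leaks)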
Example #4
def Train(language='Persian'):
    # Load the preprocessed text, labels, and embedding artifacts for this language.
    question_pre = load_data(pre_path_files + language + '/question_pre')
    answering_pre = load_data(pre_path_files + language + '/answering_pre')
    is_similar = load_data(pre_path_files + language + '/is_similar')
    tokenizer = load_data(pre_path_files + language + '/Word2Vec/tokenizer')
    embedding_matrix = load_data(pre_path_files + language + '/Word2Vec/embedding_matrix')
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    sentences_pair = [(x1, x2) for x1, x2 in zip(question_pre, answering_pre)]
    # CONFIG is assumed to be defined at module level, as in the other snippets.
    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length, CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                            CONFIG.activation_function, CONFIG.validation_split_ratio)
    siamese.train_model(sentences_pair, is_similar, embedding_meta_data,
                        model_save_directory=pre_path_files + language + '/Model/')
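Train and Update_Train both lean on a load_data helper that is not shown; a minimal pickle-based sketch, assuming each preprocessed artifact was serialized to its own file:

import pickle

def load_data(path):
    # Deserialize one saved artifact (token lists, labels, tokenizer, ...).
    with open(path, 'rb') as f:
        return pickle.load(f)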
def train_model(sentences1, sentences2, is_similar):
    sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]

    # Build the tokenizer and embedding matrix needed below.
    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    class Configuration(object):
        """Holds model hyperparameters read from siamese_config."""

    CONFIG = Configuration()

    CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']  # 100
    CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']  # 20
    CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']  # 100
    CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']  # 0.2
    CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']  # 100
    CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']  # relu
    CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']  # 0.2
    CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']  # 0.15

    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length, CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                            CONFIG.activation_function, CONFIG.validation_split_ratio)
    best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data,
                                          model_save_directory='./')
    return best_model_path
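A hypothetical call to this variant; the sentence lists and labels below are illustrative only:

sentences1 = ['how old are you', 'what is your name']
sentences2 = ['what is your age', 'who are you']
is_similar = [1, 0]
best_model_path = train_model(sentences1, sentences2, is_similar)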
Example #6
def Update_Train(language, question, answer):
    # Resolve the numeric language code to its data directory.
    if language == 0:
        lang_dir = pre_path_files + 'Persian/'
    elif language == 1:
        lang_dir = pre_path_files + 'English/'
    else:
        return None

    question_pre = load_data(lang_dir + 'question_pre')
    answering_pre = load_data(lang_dir + 'answering_pre')
    is_similar = load_data(lang_dir + 'is_similar')
    tokenizer = load_data(lang_dir + 'tokenizer')
    embedding_matrix = load_data(lang_dir + 'embedding_matrix')
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    sentences_pair = [(x1, x2) for x1, x2 in zip(question_pre, answering_pre)]
    # CONFIG is assumed to be defined at module level, as in the other snippets.
    siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length, CONFIG.number_lstm_units,
                            CONFIG.number_dense_units, CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                            CONFIG.activation_function, CONFIG.validation_split_ratio)
    best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data,
                                          model_save_directory=lang_dir)
    return best_model_path
Example #7
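The module-level flow below assumes sentences_pair, is_similar, and embedding_meta_data were prepared earlier in the script. A hypothetical preparation step consistent with the other snippets (the DataFrame and its columns are illustrative):

sentences1 = list(df['question1'])
sentences2 = list(df['question2'])
is_similar = list(df['is_duplicate'])
tokenizer, embedding_matrix = word_embed_meta_data(
    sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
embedding_meta_data = {'tokenizer': tokenizer, 'embedding_matrix': embedding_matrix}
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]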

class Configuration(object):
    """Holds model hyperparameters read from siamese_config."""

CONFIG = Configuration()
CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']

siamese = SiameseBiLSTM(CONFIG.embedding_dim, CONFIG.max_sequence_length, CONFIG.number_lstm_units,
                        CONFIG.number_dense_units, CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense,
                        CONFIG.activation_function, CONFIG.validation_split_ratio)

best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data, model_save_directory='./')


########################
###### Testing #########
########################
def test(test_pair):
    model = load_model(best_model_path)

    test_sentence_pairs = test_pair
    bias1 = random.uniform(0.28123, 0.3156)
    bias2 = random.uniform(0.07682, 0.09354)
    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])