import logging
import pickle

import datasets  # local helper module: dataset loading and vectorization
import models    # local helper module: model building, training, evaluation

LOG = logging.getLogger(__name__)


def run():
    LOG.info('Start loading datasets')
    labels = datasets.labels('../resources/labels.csv')
    labels_count = len(labels)
    train_dataset = datasets.load_dataset('../resources/train.csv', labels_count)
    test_dataset = datasets.load_dataset('../resources/test.csv', labels_count)
    LOG.info('Done loading datasets')

    vocabulary_size = 500
    embedding_dimensions = 50

    LOG.info('Start training vectorizer')
    vectorizer = models.train_vectorizer(train_dataset, vocabulary_size)
    LOG.info('Done training vectorizer')

    LOG.info('Start building model')
    classification_model = models.classification_model(
        vocabulary_size, embedding_dimensions, labels_count
    )
    classification_model.summary(print_fn=LOG.info)
    LOG.info('Done building model')

    LOG.info('Start training model')
    epochs = 36
    batch_size = 64
    vectorized_train_dataset = datasets.vectorize(vectorizer, vocabulary_size, train_dataset)
    vectorized_test_dataset = datasets.vectorize(vectorizer, vocabulary_size, test_dataset)
    models.train_classification_model(
        classification_model,
        batch_size,
        epochs,
        vectorized_train_dataset,
        vectorized_test_dataset,
    )
    LOG.info('Done training model')

    LOG.info('Start evaluating model')
    model_metrics, confusion_matrix, classification_report = models.evaluate_classification_model(
        classification_model, vectorized_test_dataset, labels
    )
    models.save_evaluation_results(
        model_metrics, confusion_matrix, classification_report, list(labels.values())
    )
    LOG.info('Done evaluating model')

    LOG.info('Start persisting model')
    classification_model.save('../resources/models/classification_model.h5')
    # Use a context manager so the pickle file handle is closed reliably.
    with open('../resources/models/word_vectorizer.pickle', 'wb') as f:
        pickle.dump(vectorizer, f)
    LOG.info('Done persisting model')
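# A minimal sketch of what models.train_vectorizer might look like, assuming the
# vectorizer is a scikit-learn CountVectorizer capped at vocabulary_size tokens
# (consistent with it being persisted via pickle above) and that the dataset
# exposes its raw texts as a `texts` attribute. Both the library choice and the
# attribute name are assumptions, not confirmed by this repo.
from sklearn.feature_extraction.text import CountVectorizer

def train_vectorizer(train_dataset, vocabulary_size):
    # Keep only the vocabulary_size most frequent tokens.
    vectorizer = CountVectorizer(max_features=vocabulary_size)
    vectorizer.fit(train_dataset.texts)  # hypothetical attribute holding raw strings
    return vectorizer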
    # Scan the dataset with each learned filter and write the responses
    # as FASTA-style records.
    for w_ in range(layer_w.shape[0]):
        answer = convolution(dataset, np.moveaxis(layer_w[w_], 1, 0))
        # Flatten the trailing edge where the filter runs off the sequence.
        answer[-length:] = answer[-length - 1]
        f.write('>' + str(length) + '_' + str(w_) + '\n')
        f.write(' '.join(map(str, answer)))
        f.write('\n')
        # plt.figure(figsize=(20, 10))
        # plt.plot(np.arange(-1000, 1000), answer, 'b')
        # plt.ylabel('Frequency')
        # plt.xlabel('Position')
        # plt.savefig(path + str(w_ + 1) + '_' + marker, dpi=500)
        # plt.close()
    f.close()


# Load both trained models and inspect their first-layer filters.
model_regression = models.regression_model((512, 4))
model_classification = models.classification_model((512, 4))
model_regression.load_weights('./models/regression.h5')
model_classification.load_weights('./models/classification.h5')

# seqlogo('regression', model_regression.get_weights()[:4])
# seqlogo('classification', model_classification.get_weights()[:4])

more_then_one('./data/train_sequences.fa', 'regression',
              model_regression.get_weights()[:4], 'freq', 'regression')
more_then_one('./data/train_sequences.fa', 'classification',
              model_classification.get_weights()[:4], 'freq', 'classification')
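# A minimal sketch of the convolution helper used above, assuming `dataset` is a
# (length, 4) one-hot sequence matrix and the filter arrives as a (width, 4)
# array after the np.moveaxis call. The sliding dot product below is one
# plausible reading of how the filter responses are produced, not a confirmed
# implementation from this repo.
import numpy as np

def convolution(dataset, kernel):
    width = kernel.shape[0]
    # Response at each position is the dot product of the filter with the
    # one-hot window starting there.
    return np.array([
        np.sum(dataset[i:i + width] * kernel)
        for i in range(dataset.shape[0] - width + 1)
    ])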
# Persist the regression train/test splits for later runs.
np.save('./data/train_regr_set', train_set)
np.save('./data/train_regr_answers', train_answers)
np.save('./data/test_regr_set', test_set)
np.save('./data/test_regr_answers', test_answers)
t3 = time.time()

# !!! 2. Models train
# !!! 2.1 Classification model
print('Training classification model')
train_set = np.load('./data/train_class_set.npy')
train_answers = np.load('./data/train_class_answers.npy')
model = models.classification_model(train_set[0].shape)
model.fit(train_set, train_answers, batch_size=64, epochs=5,
          verbose=2, validation_split=0.1)

# Evaluate on the held-out test split and persist the trained model.
test_set = np.load('./data/test_class_set.npy')
test_answers = np.load('./data/test_class_answers.npy')
predictions = model.predict(test_set)
metrics.all_class_metrics(predictions, test_answers)
model.save('./models/class.h5')
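# A minimal sketch of what metrics.all_class_metrics might compute, assuming
# one-hot test answers and per-class probability predictions; the exact set of
# metrics reported by the repo's helper is not confirmed.
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def all_class_metrics(predictions, test_answers):
    # Reduce probabilities / one-hot vectors to class indices.
    y_pred = np.argmax(predictions, axis=1)
    y_true = np.argmax(test_answers, axis=1)
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))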
def train_model(learning_mode, model_name, save_path, pickle_name, batch_size, spectrogram,
                verbose=1, epochs=20,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                generate_noisy_data=True, number_of_noisy_samples=3, audio_padding=True,
                mfcc_dim=13, mfcc_features=[0]):
    '''
    Main training function. Call this function to train a model.

    :params:
        learning_mode - String, type of learning approach you want to take:
                        speech-to-text or classification
        model_name - String, name of the model, used to save the model's weights
        save_path - String, name of the folder where the model's data will be saved
        pickle_name - String, name of the pickle file used to save the model's
                      loss history data
        batch_size - Integer
        spectrogram - Boolean, if True data will be generated with spectrogram
                      features, otherwise MFCC features will be used
        verbose - Integer, if 1 Keras will show the training process
        epochs - Integer, number of epochs per training session
        optimizer - Keras optimizer object
        generate_noisy_data - Boolean, if True the generator will produce noisy data
                              as additional training data.
                              NOTE: This will increase your batch size; if you have
                              4GB or less RAM, don't use this parameter or set
                              batch_size to 32 or less
        number_of_noisy_samples - Integer, number of noisy samples generated PER
                                  sample in a batch
        audio_padding - Boolean, if set to True each sample will be padded with
                        zeros to match sample_rate
        mfcc_dim - Integer, number of MFCC features
        mfcc_features - Integer list, which MFCC features to use.
                        Example: [0] will only use regular MFCC features,
                        [0, 1] -> regular + delta features
    '''

    # Define the data generator for the training process
    generator = AudioGenerator(learning_mode=learning_mode,
                               spectrogram=spectrogram,
                               batch_size=batch_size,
                               mfcc_features=mfcc_features,
                               mfcc_dim=mfcc_dim,
                               padd_to_sr=audio_padding,
                               generate_noisy_data=generate_noisy_data,
                               number_of_noisy_samples=number_of_noisy_samples)

    # Calculate steps per epoch
    num_train_examples = len(generator.training_files)
    steps_per_epoch = num_train_examples // batch_size
    num_valid_samples = len(generator.validation_paths)
    validation_steps = num_valid_samples // batch_size

    # Pretty much hard-coded feature-count decision
    # TODO: Make this better; for now it won't handle delta and delta-delta MFCC features
    if spectrogram:
        features = 161
    else:
        features = 13

    if learning_mode == 'speech_to_text':
        model = speech_to_text_model(input_dim=features,
                                     filters=200,
                                     kernel_size=11,
                                     strides=2,
                                     padding='valid',
                                     rnn_units=200,
                                     output_dim=29)  # number of characters

        # Add CTC loss to the speech_to_text model
        model = add_ctc_loss(model)
        model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    elif learning_mode == 'classification':
        # If learning_mode is set to classification, define the classification model
        # with categorical cross-entropy loss
        model = classification_model(input_dim=(99, features),
                                     filters=256,
                                     kernel_size=1,
                                     strides=1,
                                     padding='valid',
                                     output_dim=len(generator.classes))  # number of classes

        # Add categorical cross-entropy loss to the classification model
        model = add_categorical_loss(model, len(generator.classes))
        # Compile the model with the chosen loss and optimizer
        model.compile(loss={'categorical_crossentropy': lambda y_true, y_pred: y_pred},
                      optimizer=optimizer)

    # Create the save folder if it doesn't exist
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Define the checkpointer responsible for saving the model during training
    checkpointer = ModelCheckpoint(filepath=save_path + model_name, verbose=0)

    # Train the chosen model with the data generator
    hist = model.fit_generator(generator=generator.next_train(),  # generates a new batch of training data
                               steps_per_epoch=steps_per_epoch,   # number of training steps per epoch
                               epochs=epochs,                     # number of epochs in a training session
                               validation_data=generator.next_valid(),  # generates a new batch of validation data
                               validation_steps=validation_steps,       # number of validation steps
                               callbacks=[checkpointer],  # all callbacks (here only the model checkpointer)
                               verbose=verbose)           # if verbose is 1 we can see the training process

    # Save the model's training history
    with open(save_path + pickle_name, 'wb') as f:
        pickle.dump(hist.history, f)
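# Example call with illustrative values; the paths and file names are
# hypothetical, and the remaining parameters keep the defaults defined above.
train_model(learning_mode='classification',
            model_name='classifier.h5',
            save_path='./results/',
            pickle_name='classifier_history.pickle',
            batch_size=32,
            spectrogram=True)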
        else:
            l1norm = np.linalg.norm(ideal - regression[idx:idx + 26], ord=1)
            if l1norm <= thres:
                results.append([idx * 4 + pos + 400, l1norm])
    return results


path = input('Path to file: ')
thres = int(input('Input threshold for L1Norm: '))  # default 100

output = open('./output.txt', 'w')
output.write('loc pos similarity \n')

# models_load
shape = (512, 4)
model_c = models.classification_model(shape)
model_r = models.regression_model(shape)
model_c.load_weights('./models/classification.h5')
model_r.load_weights('./models/regression.h5')

# dataset_make
locs_names, seqs_set = dataset(path)

# analysis
for loc in locs_names:
    # read loc
    start, end, seq, seq_len, pos, results = loc[1], loc[2], '', 0, 0, []
    for row in range(start, end):
        if row % 10000 == 0:
            print('loc', loc[0], 'position', pos)
        seq += seqs_set[row]
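# A minimal sketch of the windowed scan that the fragment above appears to come
# from: slide a 26-bin window over the predicted regression profile and keep the
# positions whose L1 distance to an ideal profile is under the threshold. The
# function name and the exact loop bounds are assumptions, not confirmed by this
# file; only the distance test and the coordinate conversion are taken verbatim.
import numpy as np

def scan_profile(regression, ideal, pos, thres):
    results = []
    # Compare each 26-bin window of the predicted profile with the ideal one.
    for idx in range(len(regression) - 26 + 1):
        l1norm = np.linalg.norm(ideal - regression[idx:idx + 26], ord=1)
        if l1norm <= thres:
            # Same genomic-coordinate conversion as in the original fragment.
            results.append([idx * 4 + pos + 400, l1norm])
    return results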