Example #1
def run():
    LOG.info('Start loading datasets')
    labels = datasets.labels('../resources/labels.csv')
    labels_count = len(labels)
    train_dataset = datasets.load_dataset('../resources/train.csv', labels_count)
    test_dataset = datasets.load_dataset('../resources/test.csv', labels_count)
    LOG.info('Done loading datasets')

    vocabulary_size = 500
    embedding_dimensions = 50

    LOG.info('Start training vectorizer')
    vectorizer = models.train_vectorizer(train_dataset, vocabulary_size)
    LOG.info('Done training vectorizer')

    LOG.info('Start building model')
    classification_model = models.classification_model(
        vocabulary_size,
        embedding_dimensions,
        labels_count
    )
    classification_model.summary(print_fn=LOG.info)
    LOG.info('Done building model')

    LOG.info('Start training model')
    epochs = 36
    batch_size = 64
    vectorized_train_dataset = datasets.vectorize(vectorizer, vocabulary_size, train_dataset)
    vectorized_test_dataset = datasets.vectorize(vectorizer, vocabulary_size, test_dataset)
    models.train_classification_model(
        classification_model,
        batch_size,
        epochs,
        vectorized_train_dataset,
        vectorized_test_dataset,
    )
    LOG.info('Done training model')

    LOG.info('Start evaluating model')
    model_metrics, confusion_matrix, classification_report = models.evaluate_classification_model(
        classification_model,
        vectorized_test_dataset,
        labels
    )
    models.save_evaluation_results(
        model_metrics,
        confusion_matrix,
        classification_report,
        list(labels.values())
    )
    LOG.info('Done evaluating model')

    LOG.info('Start persisting model')
    classification_model.save('../resources/models/classification_model.h5')
    with open('../resources/models/word_vectorizer.pickle', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)
    LOG.info('Done persisting model')
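The persisted artifacts above can later be restored for inference. Below is a minimal sketch of reloading them, assuming a standard Keras installation and the same relative paths as in the example; the import style and variable names are assumptions, not taken from the original project, and a model with custom layers would additionally need custom_objects.

# Minimal sketch: reload the saved classifier and vectorizer (paths mirror the example above).
import pickle

from tensorflow import keras

classification_model = keras.models.load_model('../resources/models/classification_model.h5')
with open('../resources/models/word_vectorizer.pickle', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)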
Example #2
        for w_ in range(layer_w.shape[0]):
            answer = convolution(dataset, np.moveaxis(layer_w[w_], 1, 0))
            answer[-length:] = answer[-length - 1]
            f.write('>' + str(length) + '_' + str(w_) + '\n')
            f.write(' '.join(map(str, answer)))
            f.write('\n')
            #plt.figure(figsize=(20,10))
            #plt.plot(np.arange(-1000, 1000), answer, 'b')
            #plt.ylabel('Frequency')
            #plt.xlabel('Position')
            #plt.savefig(path+str(w_+1)+'_'+marker, dpi=500)
            #plt.close()

    f.close()


model_regression = models.regression_model((512, 4))
model_classification = models.classification_model((512, 4))

model_regression.load_weights('./models/regression.h5')
model_classification.load_weights('./models/classification.h5')

#seqlogo('regression', model_regression.get_weights()[:4])
#seqlogo('classification', model_classification.get_weights()[:4])

more_then_one('./data/train_sequences.fa', 'regression',
              model_regression.get_weights()[:4], 'freq', 'regression')
more_then_one('./data/train_sequences.fa', 'classification',
              model_classification.get_weights()[:4], 'freq', 'classification')
Example #3
np.save('./data/train_regr_set', train_set)
np.save('./data/train_regr_answers', train_answers)
np.save('./data/test_regr_set', test_set)
np.save('./data/test_regr_answers', test_answers)

t3 = time.time()
# !!! 2. Model training
# !!! 2.1 Classification model

print('Training classification model')

train_set = np.load('./data/train_class_set.npy')
train_answers = np.load('./data/train_class_answers.npy')

model = models.classification_model(train_set[0].shape)
model.fit(train_set,
          train_answers,
          batch_size=64,
          epochs=5,
          verbose=2,
          validation_split=0.1)

test_set = np.load('./data/test_class_set.npy')
test_answers = np.load('./data/test_class_answers.npy')

predictions = model.predict(test_set)
metrics.all_class_metrics(predictions, test_answers)

model.save('./models/class.h5')
Example #4
def train_model(learning_mode,
                model_name,
                save_path,
                pickle_name,
                batch_size,
                spectrogram,
                verbose=1,
                epochs=20,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                generate_noisy_data=True,
                number_of_noisy_samples=3,
                audio_padding=True,
                mfcc_dim=13,
                mfcc_features=[0]):
    '''
    Main training function. Call this function to train a model.

    :params:
        learning_mode - String, type of learning approach you want to take: speech to text or classification
        model_name - String, name of the model, used to save the model's weights
        save_path - String, name of the folder where the model's data will be saved
        pickle_name - String, name of the pickle file used for saving the model's loss history data
        batch_size - Integer
        spectrogram - Boolean, if True data will be generated with spectrogram features, otherwise MFCC features will be used
        verbose - Integer, if 1 Keras will show the training process
        epochs - Integer, number of epochs per training session
        optimizer - Keras optimizer object
        generate_noisy_data - Boolean, if True the generator will generate noisy data as additional training data
                                       NOTE: This will increase your effective batch size; if you have 4GB of RAM or less, don't use this option or set batch_size to 32 or less
        number_of_noisy_samples - Integer, number of noisy samples generated PER sample in a batch
        audio_padding - Boolean, if set to True each sample will be padded with zeros to match sample_rate
        mfcc_dim - Integer, number of MFCC features
        mfcc_features - Integer list, which MFCC features to use. Example: [0] uses only regular MFCC features, [0, 1] -> regular + delta features
    '''

    #Defines the data generator for the training process
    generator = AudioGenerator(learning_mode=learning_mode,
                               spectrogram=spectrogram,
                               batch_size=batch_size, 
                               mfcc_features=mfcc_features, 
                               mfcc_dim=mfcc_dim,
                               padd_to_sr=audio_padding,
                               generate_noisy_data=generate_noisy_data,
                               number_of_noisy_samples=number_of_noisy_samples)
    
    
    #calculate steps per epoch
    num_train_examples=len(generator.training_files)
    steps_per_epoch = num_train_examples//batch_size

    num_valid_samples = len(generator.validation_paths) 
    validation_steps = num_valid_samples//batch_size
    
    #Hard-coded choice of the input feature dimension
    #TODO: Make this better, because for now it won't handle delta and delta-delta MFCC features
    if spectrogram:
        features = 161
    else:
        features = 13

    if learning_mode == 'speech_to_text':
        model = speech_to_text_model(input_dim=features, 
                                     filters=200, 
                                     kernel_size=11, 
                                     strides=2, 
                                     padding='valid',
                                     rnn_units=200, 
                                     output_dim=29) #number of characters

        #Adds CTC loss for the speech_to_text model
        model = add_ctc_loss(model)
        model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    
    elif learning_mode == 'classification':
        #If learning_mode is set to classification, define the classification model with categorical crossentropy loss
        model = classification_model(input_dim=(99, features), 
                                     filters=256, 
                                     kernel_size=1, 
                                     strides=1, 
                                     padding='valid', 
                                     output_dim=len(generator.classes)) #number of classes

        #Adds categorical crossentropy loss for the classification model
        model = add_categorical_loss(model , len(generator.classes))
        #Compile the model with the chosen loss and optimizer
        model.compile(loss={'categorical_crossentropy': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
        
    #Creates the save folder if it doesn't exist
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    
    #Defines checkpointer that is responsible for saving the model after N steps of the training
    checkpointer = ModelCheckpoint(filepath=save_path+model_name, verbose=0)

    #Train the chosen model with the data generator
    hist = model.fit_generator(generator=generator.next_train(),            #Calls the generator's next_train function, which yields a new batch of training data
                                steps_per_epoch=steps_per_epoch,            #Defines how many training steps there are per epoch
                                epochs=epochs,                              #Defines how many epochs the training process takes
                                validation_data=generator.next_valid(),     #Calls the generator's next_valid function, which yields a new batch of validation data
                                validation_steps=validation_steps,          #Defines how many validation steps there are
                                callbacks=[checkpointer],                   #Defines all callbacks (in this case only the model checkpointer that saves the model)
                                verbose=verbose)                            #If verbose is 1 we can see the training progress
    
    #Save the model's training history
    with open(save_path + pickle_name, 'wb') as f:
        pickle.dump(hist.history, f)
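For reference, a sketch of how train_model might be invoked in classification mode. Only the parameter names and the 'classification' mode come from the function above; the paths, file names, and hyperparameter values below are illustrative assumptions.

# Illustrative call only: save_path, model_name, and pickle_name are made-up values.
train_model(learning_mode='classification',
            model_name='classification_model.h5',
            save_path='./results/',
            pickle_name='classification_history.pickle',
            batch_size=32,
            spectrogram=True,
            epochs=20)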
Example #5
                else:
                    l1norm = np.linalg.norm(ideal - regression[idx:idx + 26],
                                            ord=1)
                    if l1norm <= thres:
                        results.append([idx * 4 + pos + 400, l1norm])
    return results


path = input('Path to file: ')
thres = int(input('Input threshold for L1Norm: '))  # default 100
output = open('./output.txt', 'w')
output.write('loc pos similarity \n')

# models_load
shape = (512, 4)
model_c = models.classification_model(shape)
model_r = models.regression_model(shape)
model_c.load_weights('./models/classification.h5')
model_r.load_weights('./models/regression.h5')

# dataset_make
locs_names, seqs_set = dataset(path)

# analysis
for loc in locs_names:
    # read loc
    start, end, seq, seq_len, pos, results = loc[1], loc[2], '', 0, 0, []
    for row in range(start, end):
        if row % 10000 == 0:
            print('loc', loc[0], 'position', pos)
        seq += seqs_set[row]