def __init__(self, config, cases):
    self.idx = 0
    self.config = config
    self.phone_to_id = utils.load_phone_mapping(config)

    metadata = test_metadata(config, cases)
    db = metadata.gen_pickle()
    self.build_dataset(db)
def __init__(self, type_, config_file):
    # The train-config batch size is used for both the train and test splits
    super().__init__(config_file, config_file['train']['batch_size'])

    from metadata import timit_metadata
    metadata = timit_metadata(type_.upper(), config_file)
    # Returns a list of (feature vectors of audio recording, phone sequence) tuples
    list_of_sent = metadata.gen_pickle()
    phone_to_id = utils.load_phone_mapping(config_file)
    self.build_dataset(list_of_sent, phone_to_id)
def __init__(self, config, min_phones, recordings_dump_path):
    """
    :param config: config file
    :param min_phones: minimum number of instances of each phone to calculate Q value
    :param recordings_dump_path: path to dump the feature vectors of the recordings to be considered
    """
    self.config = config
    self.pkl_name = recordings_dump_path
    self.min_phones = min_phones
    self.idx = 0
    self.win_len, self.win_step = config['window_size'], config['window_step']
    # Initialise model
    self.rnn = dl_model('infer')
    # Load mapping of phone to id
    self.phone_to_id = utils.load_phone_mapping(config)
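# For reference, the standard framing arithmetic implied by win_len/win_step above.
# A sketch assuming both values are in seconds and 16 kHz TIMIT audio; num_frames is
# an illustrative helper, not part of this repo.
def num_frames(n_samples, win_len, win_step, fs=16000):
    """Number of analysis windows that fit in a signal of n_samples samples."""
    win, step = int(win_len * fs), int(win_step * fs)
    if n_samples < win:
        return 0
    return 1 + (n_samples - win) // step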
def find_batch_q(dump_path, prob_path, dec_type, top_n, exp_factor, rnn_model=None,
                 min_phones=200, min_sub_len=4, max_sub_len=15):
    """
    Computes the q-value for each phone averaged over a specified number of instances
    :param rnn_model: dl_model object handle
    :param max_sub_len: max length of random subsequence chosen from gr_phone for q-value calculation
    :param min_sub_len: min length of random subsequence chosen from gr_phone for q-value calculation
    :param prob_path: path to probability file
    :param dump_path: path to dump file
    :param min_phones: minimum number of phones to be covered
    :param dec_type: max or CTC
    :param top_n: top_n sequences to be considered
    :param exp_factor: weight assigned to probability score
    :return: a dictionary of q-value for each phone and probabilities for insertion, deletion, substitution
    """
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            vals = pickle.load(f)
            print('Loaded Q values from dump:', vals[0])
            return vals

    config = read_yaml()
    phone_to_id = utils.load_phone_mapping(config)
    blank_token_id = phone_to_id['BLANK']

    if rnn_model is None:
        rnn_model = dl_model('infer')

    if not os.path.exists(config['dir']['pickle']):
        os.mkdir(config['dir']['pickle'])

    database_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                                 'QValGenModel_in_' + str(min_phones) + '.pkl')
    model_out_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                                  'QValGenModel_out_' + str(min_phones) + '.pkl')

    # Instantiate the model to calculate predictions
    dataloader = qval_dataloader(config, min_phones, database_name)
    model = qval_model(config, model_out_name, dataloader, rnn_model)
    db = model.get_outputs()

    # Load probability vectors; replace zero entries with a scaled-down minimum
    # so that the power/log operations below stay well-defined
    with open(prob_path, 'rb') as f:
        insert_prob, delete_prob, replace_prob = pickle.load(f)

    div = config['prob_thesh_const']

    temp = np.where(replace_prob == 0, 1, replace_prob)
    minimum = np.min(temp)
    print("Minimum substitution prob:", minimum)
    replace_prob = np.where(replace_prob == 0, minimum / div, replace_prob)

    temp = np.where(insert_prob == 0, 1, insert_prob)
    minimum = np.min(temp)
    print("Minimum insertion prob:", minimum)
    insert_prob = np.where(insert_prob == 0, minimum / div, insert_prob)

    temp = np.where(delete_prob == 0, 1, delete_prob)
    minimum = np.min(temp)
    print("Minimum deletion prob:", minimum)
    delete_prob = np.where(delete_prob == 0, minimum / div, delete_prob)

    final_dict = {}
    insert_prob_pow = np.power(insert_prob, exp_factor)
    delete_prob_pow = np.power(delete_prob, exp_factor)
    replace_prob_pow = np.power(replace_prob, exp_factor)

    print("Probabilities:\nInsert:", insert_prob, '\nDelete:', delete_prob,
          '\nSubsti:', replace_prob)

    # For each sentence in the database, find the best subsequence, align and calculate q values
    for i, (output, length, gr_phone, label_lens) in enumerate(db):
        if i % max(1, len(db) // 10) == 0:
            print("On output:", str(i) + "/" + str(len(db)))

        cur_out = output[:length]
        gr_phone_ids = np.array(gr_phone[:label_lens])

        # Pick a random subsequence of the ground-truth phones as the template
        random_subsequence_len = min(np.random.randint(min_sub_len, max_sub_len),
                                     len(gr_phone_ids) - 1)
        sub_start = np.random.randint(0, len(gr_phone_ids) - random_subsequence_len)
        random_subsequence = gr_phone_ids[sub_start:sub_start + random_subsequence_len]

        # Generate lattice from current predictions
        lattices = generate_lattice(cur_out, blank_token_id, dec_type, top_n)
        # Find best subsequence in lattice
        res_substring, _ = traverse_best_lattice(lattices, dec_type, random_subsequence,
                                                 insert_prob, delete_prob, replace_prob)
        # Calculate q values by comparing template and best match
        q_vals = find_q_values(random_subsequence, res_substring[0], res_substring[1],
                               insert_prob_pow, delete_prob_pow, replace_prob_pow)

        # Accumulate q values per phone
        for ph_id, list_of_qvals in q_vals.items():
            if ph_id not in final_dict:
                final_dict[ph_id] = []
            final_dict[ph_id] += list_of_qvals

    # Average out the values
    final_dict = {k: (sum(v) / len(v), len(v)) for k, v in final_dict.items()}

    with open(dump_path, 'wb') as f:
        pickle.dump((final_dict, insert_prob, delete_prob, replace_prob), f)

    print("Q values:", final_dict)
    return final_dict, insert_prob, delete_prob, replace_prob
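# Usage sketch for find_batch_q. The paths and hyper-parameters below are
# illustrative assumptions, not values fixed by this repo.
q_vals, insert_prob, delete_prob, replace_prob = find_batch_q(
    dump_path='pickle/final_q_vals.pkl',  # cache for the (q_values, probs) tuple
    prob_path='pickle/probs.pkl',         # pickled (insert, delete, substitute) matrices
    dec_type='max',                       # 'max' or 'CTC' lattice decoding
    top_n=5,                              # keep the 5 best sequences per time step
    exp_factor=1.0)                       # weight of the probability score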
def __init__(self, config_file, min_phones, recordings_dump_path):
    super().__init__(config_file)

    metadata = qval_metadata(config_file, min_phones, recordings_dump_path)
    ptoid = utils.load_phone_mapping(config_file)
    self.build_dataset(metadata.gen_pickle(), ptoid, bound_lengths=False)
def __init__(self, config, to_do):
    super(liGRU, self).__init__(config)

    self.phone_to_id = utils.load_phone_mapping(config)
    self.rnn_name = config['rnn']
    # Store important parameters
    self.input_dim = config['n_mfcc'] + config['n_fbank']
    self.hidden_dim = config['hidden_dim']
    self.num_phones = config['num_phones']
    self.num_layers = config['num_layers']
    self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
    self.blank_token_id = self.phone_to_id['BLANK']

    self.ligru_act = nn.LeakyReLU(0.2)
    self.is_bidirectional = config['bidirectional']
    self.ligru_orthinit = True
    self.dropout_vals = [0.2] * self.num_layers
    self.hidden_dim_layers = [self.hidden_dim] * self.num_layers
    self.use_cuda = torch.cuda.is_available() and config['use_cuda']
    self.use_batchnorm = True
    self.use_batchnorm_inp = True

    # List initialization
    self.wh = nn.ModuleList([])   # Candidate state
    self.uh = nn.ModuleList([])   # Candidate state, recurrent
    self.wz = nn.ModuleList([])   # Update gate
    self.uz = nn.ModuleList([])   # Update gate, recurrent
    self.act = nn.ModuleList([])  # Activations

    # Batch norm
    if self.use_batchnorm:
        self.bn_wh = nn.ModuleList([])
        self.bn_wz = nn.ModuleList([])

    # Input batch normalization
    if self.use_batchnorm_inp:
        if self.is_bidirectional:
            self.bn0 = nn.BatchNorm1d(2 * self.input_dim, momentum=0.05)
        else:
            self.bn0 = nn.BatchNorm1d(self.input_dim, momentum=0.05)

    current_input = self.input_dim

    # Initialization of hidden layers
    for i in range(self.num_layers):
        # Activations
        self.act.append(self.ligru_act)

        # Batch norm absorbs the bias of the feed-forward connections
        add_bias = not self.use_batchnorm

        # Feed-forward connections
        if i == 0 and self.is_bidirectional:
            self.wh.append(nn.Linear(2 * current_input, self.hidden_dim_layers[i], bias=add_bias))
            self.wz.append(nn.Linear(2 * current_input, self.hidden_dim_layers[i], bias=add_bias))
        else:
            self.wh.append(nn.Linear(current_input, self.hidden_dim_layers[i], bias=add_bias))
            self.wz.append(nn.Linear(current_input, self.hidden_dim_layers[i], bias=add_bias))

        # Recurrent connections
        self.uh.append(nn.Linear(self.hidden_dim_layers[i], self.hidden_dim_layers[i], bias=False))
        self.uz.append(nn.Linear(self.hidden_dim_layers[i], self.hidden_dim_layers[i], bias=False))

        if self.ligru_orthinit:
            nn.init.orthogonal_(self.uh[i].weight)
            nn.init.orthogonal_(self.uz[i].weight)

        if self.use_batchnorm:
            # Batch norm initialization
            self.bn_wh.append(nn.BatchNorm1d(self.hidden_dim_layers[i], momentum=0.05))
            self.bn_wz.append(nn.BatchNorm1d(self.hidden_dim_layers[i], momentum=0.05))

        if self.is_bidirectional:
            current_input = 2 * self.hidden_dim_layers[i]
        else:
            current_input = self.hidden_dim_layers[i]

    self.out_dim = self.hidden_dim_layers[-1] + self.is_bidirectional * self.hidden_dim_layers[-1]

    if self.is_bidirectional:
        self.hidden_2_phone = nn.Linear(2 * self.hidden_dim, self.output_dim)
    else:
        self.hidden_2_phone = nn.Linear(self.hidden_dim, self.output_dim)

    self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id, reduction='mean',
                                      zero_infinity=False)
    print("Using CTC loss")

    optimizer = config['train']['optim']
    if optimizer == 'SGD':
        self.optimizer = optim.SGD(self.parameters(), lr=config['train']['lr'], momentum=0.9)
    elif optimizer == 'Adam':
        self.optimizer = optim.Adam(self.parameters(), lr=config['train']['lr'])
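# A minimal single-step sketch of the li-GRU recurrence the module lists above
# implement (Ravanelli et al., "Light Gated Recurrent Units for Speech Recognition"):
# the reset gate is dropped, and batch norm is applied to the feed-forward parts.
# Names (wh, uh, wz, uz, bn_wh, bn_wz, act) mirror layer i of the class; the time
# loop, input batch norm (bn0), dropout and bidirectional handling are omitted here.
def ligru_step(self, i, x_t, h_prev):
    # Update gate: sigmoid over batch-normed feed-forward part plus recurrent part
    z_t = torch.sigmoid(self.bn_wz[i](self.wz[i](x_t)) + self.uz[i](h_prev))
    # Candidate state: ReLU-family activation instead of tanh, no reset gate
    h_cand = self.act[i](self.bn_wh[i](self.wh[i](x_t)) + self.uh[i](h_prev))
    # Convex combination of previous and candidate state
    return z_t * h_prev + (1 - z_t) * h_cand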
def __init__(self, config, mode):
    super(RNN, self).__init__(config)

    self.rnn_name = config['rnn']
    # Store important parameters
    self.feat_dim = config['n_mfcc'] + config['n_fbank']
    self.hidden_dim = config['hidden_dim']
    self.num_phones = config['num_phones']
    self.num_layers = config['num_layers']
    self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
    self.phone_to_id = utils.load_phone_mapping(config)
    self.blank_token_id = self.phone_to_id['BLANK']

    # Original code built four near-identical branches; note the unidirectional GRU
    # branch erroneously passed bidirectional=True, fixed here by construction
    is_bidirectional = config['bidirectional']
    rnn_cls = nn.LSTM if self.rnn_name == 'LSTM' else nn.GRU
    self.rnn = rnn_cls(input_size=self.feat_dim,
                       hidden_size=self.hidden_dim,
                       num_layers=self.num_layers,
                       dropout=config['dropout'],
                       bidirectional=is_bidirectional,
                       batch_first=True)

    # In the linear layer, *2 for bidirectional
    num_directions = 2 if is_bidirectional else 1
    self.hidden2phone = nn.Linear(self.hidden_dim * num_directions, self.output_dim)

    self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id, reduction='mean',
                                      zero_infinity=False)
    print("Using CTC loss")

    optimizer = config['train']['optim']
    if optimizer == 'SGD':
        self.optimizer = optim.SGD(self.parameters(), lr=config['train']['lr'], momentum=0.9)
    elif optimizer == 'Adam':
        self.optimizer = optim.Adam(self.parameters(), lr=config['train']['lr'])
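# Shape sketch for the CTC loss above (standard torch.nn.CTCLoss contract; the batch
# values here are made up for illustration). Log-probs must be time-major (T, N, C)
# with C = output_dim, so a batch-first RNN output needs a transpose plus log_softmax
# before the loss call.
import torch

T, N, C = 200, 8, 40                    # frames, batch size, num_phones + pad + blank
log_probs = torch.randn(T, N, C).log_softmax(2)
targets = torch.randint(1, C - 1, (N, 30), dtype=torch.long)  # phone ids, no blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.randint(10, 31, (N,), dtype=torch.long)
loss = torch.nn.CTCLoss(blank=C - 1)(log_probs, targets, input_lengths, target_lengths)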
def __init__(self, config, mode):
    super(customRNN, self).__init__(config)

    self.rnn_name = config['rnn']
    # Store important parameters
    self.phone_to_id = utils.load_phone_mapping(config)
    self.feat_dim = config['n_mfcc'] + config['n_fbank']
    self.hidden_dim = config['hidden_dim']
    self.num_phones = config['num_phones']
    self.num_layers = config['num_layers']
    self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
    self.blank_token_id = self.phone_to_id['BLANK']
    self.num_directions = 2 if config['bidirectional'] else 1

    dropout, r_dropout = config['dropout'], config['r_dropout']
    layer_norm, batch_norm = config['layerNorm'], config['batchnorm']

    if config['bidirectional']:
        if self.rnn_name == 'customLSTM':
            self.rnn = custom_rnn.LayerNormLSTM(self.feat_dim, self.hidden_dim, self.num_layers,
                                                0.3, 0.3, bidirectional=True,
                                                layer_norm_enabled=True)
        elif self.rnn_name == 'customGRU':
            self.rnn = custom_rnn.customGRU(self.feat_dim, self.hidden_dim, self.num_layers,
                                            dropout=dropout, layer_norm_enabled=layer_norm,
                                            r_dropout=r_dropout, bidirectional=True)
        elif self.rnn_name == 'customliGRU':
            self.rnn = custom_rnn.customliGRU(self.feat_dim, self.hidden_dim, self.num_layers,
                                              dropout=dropout, bidirectional=True)
        # In the linear layer, *2 for bidirectional
        self.hidden2phone = nn.Linear(self.hidden_dim * 2, self.output_dim)
    else:
        if self.rnn_name == 'customLSTM':
            self.rnn = custom_rnn.LayerNormLSTM(self.feat_dim, self.hidden_dim, self.num_layers,
                                                0.2, 0.2, bidirectional=False,
                                                layer_norm_enabled=True)
        elif self.rnn_name == 'customGRU':
            # The original unidirectional branch passed bidirectional=True; fixed here
            self.rnn = custom_rnn.customGRU(self.feat_dim, self.hidden_dim, self.num_layers,
                                            dropout=dropout, layer_norm_enabled=layer_norm,
                                            r_dropout=r_dropout, bidirectional=False)
        elif self.rnn_name == 'customliGRU':
            self.rnn = custom_rnn.customliGRU(self.feat_dim, self.hidden_dim, self.num_layers,
                                              dropout=dropout, bidirectional=False)
        self.hidden2phone = nn.Linear(self.hidden_dim, self.output_dim)

    self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id, reduction='mean',
                                      zero_infinity=False)
    print("Using CTC loss")

    optimizer = config['train']['optim']
    if optimizer == 'SGD':
        self.optimizer = optim.SGD(self.parameters(), lr=config['train']['lr'], momentum=0.9)
    elif optimizer == 'Adam':
        self.optimizer = optim.Adam(self.parameters(), lr=config['train']['lr'])
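# Illustrative config fragment showing how the cell implementation is selected above.
# The keys are inferred from what this constructor reads; the values are example
# assumptions, not repo defaults.
example_config = {
    'rnn': 'customGRU',          # one of customLSTM / customGRU / customliGRU
    'n_mfcc': 13, 'n_fbank': 26,
    'hidden_dim': 256, 'num_phones': 38, 'num_layers': 3,
    'bidirectional': True,
    'dropout': 0.2, 'r_dropout': 0.2,   # feed-forward vs recurrent dropout
    'layerNorm': True, 'batchnorm': False,
    'train': {'optim': 'Adam', 'lr': 1e-3},
}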
def batch_test(config, dec_type, top_n, num_templates, num_compares, num_none,
               results_dump_path, exp_factor=1):
    """
    Master function which carries out the actual testing
    :param config: config dictionary
    :param dec_type: max or CTC
    :param top_n: top-n sequences are considered
    :param num_templates: number of templates for each keyword
    :param num_compares: number of clips in which each keyword needs to be searched for
    :param num_none: number of clips in which none of the keywords is present
    :param results_dump_path: dump comparison results so that c values can be tweaked easily
    :param exp_factor: weight assigned to probability score
    """
    keywords = [
        'academic', 'reflect', 'equipment', 'program', 'rarely',
        'national', 'social', 'movies', 'greasy', 'water'
    ]
    # keywords = ['oily', 'people', 'before', 'living', 'potatoes', 'children', 'overalls',
    #             'morning', 'enough', 'system', 'water', 'greasy', 'suit', 'dark', 'very',
    #             'without', 'money', 'reflect', 'program', 'national', 'social', 'water',
    #             'carry', 'time', 'before', 'always', 'often', 'people', 'money',
    #             'potatoes', 'children']
    # keywords = ['oily', 'people', 'before', 'living', 'water', 'children']
    # keywords = ['like', 'carry', 'will', 'potatoes', 'before', 'government', 'economic',
    #             'overalls', 'through', 'money', 'children']

    # NOTE: rnn_model is assumed to be a module-level dl_model('infer') handle
    test_case_name = 'test_cases_' + str(num_templates) + '_' + str(num_compares) + \
                     '_' + str(num_none) + '.pkl'
    pkl_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name, test_case_name)
    results_dump_path = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                                     results_dump_path)

    # Generate cases to be tested on
    cases = gen_cases(['../datasets/TIMIT/TEST/', '../datasets/TIMIT/TRAIN/'],
                      '../datasets/TIMIT/TEST/', pkl_name, num_templates, num_compares,
                      num_none, keywords, config['gen_template'])

    infer_mode = config['infer_mode']

    if os.path.exists(results_dump_path):
        with open(results_dump_path, 'rb') as f:
            return pickle.load(f)

    a = test_model(config, cases)

    # Q values and probabilities are loaded. It is important to load probability values
    # from HERE since they influence thresholds and Q-values
    qval_pth = os.path.join(config['dir']['pickle'], rnn_model.arch_name, 'final_q_vals.pkl')
    prob_pth = os.path.join(config['dir']['pickle'], rnn_model.arch_name, 'probs.pkl')
    (thresholds, insert_prob, delete_prob,
     replace_prob) = find_batch_q(qval_pth, prob_pth, dec_type, top_n, exp_factor,
                                  rnn_model=rnn_model)

    # Dictionary for storing c values required to declare keyword
    final_results = {}
    for kw in cases.keys():
        final_results[kw] = {}

    # Initialise model
    db = a.get_outputs()
    phone_to_id = utils.load_phone_mapping(config)
    id_to_phone = {v: k for k, v in phone_to_id.items()}

    insert_prob_pow = np.power(insert_prob, exp_factor)
    delete_prob_pow = np.power(delete_prob, exp_factor)
    replace_prob_pow = np.power(replace_prob, exp_factor)

    # Iterate over every clip and compare it with every template one-by-one.
    # Note that gr_phone_entire_clip is NOT USED
    for i, (output, length, gr_phone_entire_clip, word_in_clip, wav_path) in enumerate(db):
        if i % max(1, len(db) // 10) == 0:
            print("On output:", str(i) + "/" + str(len(db)))

        cur_out = output[:length]
        # Generate lattice from current predictions
        lattices = generate_lattice(cur_out, rnn_model.model.blank_token_id, dec_type, top_n)

        # Compare with every template
        for template_word, templates in cases.items():
            # If no keyword, then continue
            if template_word == 'NONE':
                continue

            templates = templates['templates']
            final_results[template_word][i] = {'data': [], 'metadata': []}

            for template_phones in templates:
                # Template phone sequence
                template_phone_ids = [phone_to_id[x] for x in template_phones]
                (pred_phones, node_prob), final_lattice = traverse_best_lattice(
                    lattices, dec_type, template_phone_ids,
                    insert_prob, delete_prob, replace_prob)
                # out_for_cnn[word_in_clip].append((pred_phones, node_prob, word_in_clip == template_word))
                # Node probabilities of best lattice
                substring_phones = [id_to_phone[x] for x in pred_phones]
                final_lattice = [id_to_phone[x[0]] for x in final_lattice]

                # Calculate q values
                q_vals = find_q_values(template_phone_ids, pred_phones, node_prob,
                                       insert_prob_pow, delete_prob_pow, replace_prob_pow)

                metadata = (wav_path, word_in_clip, template_word, gr_phone_entire_clip,
                            final_lattice, substring_phones, template_phones)
                final_results[template_word][i]['metadata'].append(metadata)

                if infer_mode == 'group':
                    # Sum up the predicted q values
                    predicted_log_val, gr_log_val = 0, 0
                    for pred_phone, vals in q_vals.items():
                        for val in vals:
                            predicted_log_val += np.log(val)
                        gr_log_val += (np.log(thresholds[pred_phone][0]) * len(vals))

                    if template_word == word_in_clip:
                        # gr_log_val should be < predicted_log_val + c
                        final_results[template_word][i]['data'].append(
                            ('right', gr_log_val, predicted_log_val))
                    else:
                        # gr_log_val should be > predicted_log_val + c
                        final_results[template_word][i]['data'].append(
                            ('wrong', gr_log_val, predicted_log_val))
                elif infer_mode == 'indi':
                    # Count the fraction of q values above the per-phone threshold
                    above, total_phones = 0, 0
                    for pred_phone, vals in q_vals.items():
                        total_phones += len(vals)
                        for val in vals:
                            if val >= thresholds[pred_phone][0]:
                                above += 1

                    if template_word == word_in_clip:
                        final_results[template_word][i]['data'].append(
                            ('right', above / total_phones))
                    else:
                        final_results[template_word][i]['data'].append(
                            ('wrong', above / total_phones))
                else:
                    print("Infer mode not defined")
                    exit(0)

    with open(results_dump_path, 'wb') as f:
        pickle.dump(final_results, f)

    print("Dumped final results of testing")
    return final_results
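# Example driver (a sketch; keyword counts and the dump filename are illustrative):
# run the batch test with 3 templates per keyword, 50 clips to search per keyword,
# and 10 clips containing none of the keywords.
if __name__ == '__main__':
    config = read_yaml()
    results = batch_test(config, dec_type='max', top_n=5,
                         num_templates=3, num_compares=50, num_none=10,
                         results_dump_path='results.pkl', exp_factor=1.0)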