Example #1
    def __init__(self, config, cases):

        self.idx = 0
        self.config = config

        self.phone_to_id = utils.load_phone_mapping(config)

        metadata = test_metadata(config, cases)
        db = metadata.gen_pickle()
        self.build_dataset(db)
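
All of these examples hinge on utils.load_phone_mapping(config). A minimal, runnable sketch of the dict shape it is assumed to return (a toy mapping here, not the real TIMIT phone set):

# Toy stand-in for the dict utils.load_phone_mapping(config) is assumed to return:
# phone symbol -> integer id, with a 'BLANK' entry used as the CTC blank token.
phone_to_id = {'aa': 0, 'ae': 1, 'sil': 2, 'BLANK': 3}
id_to_phone = {v: k for k, v in phone_to_id.items()}  # inverse map, as used in Example #9
assert id_to_phone[phone_to_id['BLANK']] == 'BLANK'
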
Example #2
    def __init__(self, type_, config_file):
        # Both train and test loaders use the training batch size
        super().__init__(config_file, config_file['train']['batch_size'])

        from metadata import timit_metadata
        metadata = timit_metadata(type_.upper(), config_file)
        # Returns a large list of (audio feature vectors, phone sequence) tuples
        list_of_sent = metadata.gen_pickle()
        phone_to_id = utils.load_phone_mapping(config_file)

        self.build_dataset(list_of_sent, phone_to_id)
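
A minimal usage sketch for this constructor, assuming the enclosing class is named timit_dataloader (a hypothetical name; only the 'train'/'test' values of type_ are visible in the snippet):

# Hypothetical instantiation; read_yaml() is the config loader used in Example #4.
config_file = read_yaml()
train_loader = timit_dataloader('train', config_file)
test_loader = timit_dataloader('test', config_file)
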
Example #3
    def __init__(self, config, min_phones, recordings_dump_path):
        """
        :param config: config files
        :param min_phones: minimum number of instances of each phone to calculate Q value
        :param recordings_dump_path: path to dump the feature vectors of the recordings to be considered
        :param model_out_path: path to final q value dump
        """

        self.config = config
        self.pkl_name = recordings_dump_path
        self.min_phones = min_phones
        self.idx = 0
        self.win_len, self.win_step = config['window_size'], config['window_step']
        # Initialise model
        self.rnn = dl_model('infer')

        # Load mapping of phone to id
        self.phone_to_id = utils.load_phone_mapping(config)
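
The window parameters read here drive feature framing. A self-contained sketch of the usual frame-count arithmetic, with hypothetical sample-domain values (the snippet does not show which units the config uses):

# Hypothetical values: 25 ms windows with a 10 ms hop at 16 kHz, expressed in samples.
win_len, win_step = 400, 160
num_samples = 32000                            # a 2-second recording
num_frames = 1 + (num_samples - win_len) // win_step
print(num_frames)                              # -> 198
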
Example #4
def find_batch_q(dump_path, prob_path, dec_type, top_n, exp_factor, rnn_model=None,
                 min_phones=200, min_sub_len=4, max_sub_len=15):
    """
    Computes the q-value for each phone, averaged over a specified number of instances
    :param rnn_model: dl_model object handle
    :param max_sub_len: max length of random subsequence chosen from gr_phone for q-value calculation
    :param min_sub_len: min length of random subsequence chosen from gr_phone for q-value calculation
    :param prob_path: path to probability file
    :param dump_path: path to dump file
    :param min_phones: minimum number of phones to be covered
    :param dec_type: max or CTC
    :param top_n: top_n sequences to be considered
    :param exp_factor: weight assigned to probability score
    :return: a dictionary of q-value for each phone and probabilities for insertion, deletion, substitution
    """

    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            vals = pickle.load(f)
            print('Loaded Q values from dump:', vals[0])
            return vals

    config = read_yaml()
    phone_to_id = utils.load_phone_mapping(config)
    blank_token_id = phone_to_id['BLANK']

    if rnn_model is None:
        rnn_model = dl_model('infer')

    if not os.path.exists(config['dir']['pickle']):
        os.mkdir(config['dir']['pickle'])

    database_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name, 'QValGenModel_in_' + str(min_phones) + '.pkl')
    model_out_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name, 'QValGenModel_out_' + str(min_phones) + '.pkl')

    # Instantiates the model to calculate predictions
    dataloader = qval_dataloader(config, min_phones, database_name)
    model = qval_model(config, model_out_name, dataloader, rnn_model)

    db = model.get_outputs()

    # load probability vectors
    with open(prob_path, 'rb') as f:
        insert_prob, delete_prob, replace_prob = pickle.load(f)
        div = config['prob_thesh_const']

        temp = np.where(replace_prob == 0, 1, replace_prob)
        minimum = np.min(temp)
        print("Minimum substitution prob:", minimum)
        replace_prob = np.where(replace_prob == 0, minimum / div, replace_prob)

        temp = np.where(insert_prob == 0, 1, insert_prob)
        minimum = np.min(temp)
        print("Minimum insertion prob:", minimum)
        insert_prob = np.where(insert_prob == 0, minimum / div, insert_prob)

        temp = np.where(delete_prob == 0, 1, delete_prob)
        minimum = np.min(temp)
        print("Minimum deletion prob:", minimum)
        delete_prob = np.where(delete_prob == 0, minimum / div, delete_prob)

    final_dict = {}
    insert_prob_pow = np.power(insert_prob, exp_factor)
    delete_prob_pow = np.power(delete_prob, exp_factor)
    replace_prob_pow = np.power(replace_prob, exp_factor)

    print("Probabilities:\nInsert:", insert_prob, '\nDelete:', delete_prob, '\nSubsti:', replace_prob)

    # for each sentence in database, find best subsequence, align and calculate q values
    for i, (output, length, gr_phone, label_lens) in enumerate(db):
        if i % max(1, len(db) // 10) == 0:
            print("On output:", str(i) + "/" + str(len(db)))
        cur_out = output[:length]
        gr_phone_ids = np.array(gr_phone[:label_lens])
        random_subsequence_len = min(np.random.randint(min_sub_len, max_sub_len), len(gr_phone_ids)-1)
        sub_start = np.random.randint(0, len(gr_phone_ids) - random_subsequence_len)
        random_subsequence = gr_phone_ids[sub_start:sub_start + random_subsequence_len]

        # Generate lattice from current predictions
        lattices = generate_lattice(cur_out, blank_token_id, dec_type, top_n)
        # Find best subsequence in lattice
        res_substring, _ = traverse_best_lattice(lattices, dec_type, random_subsequence,
                                                 insert_prob, delete_prob, replace_prob)
        # Calculate q values by comparing template and best match
        q_vals = find_q_values(random_subsequence, res_substring[0], res_substring[1],
                               insert_prob_pow, delete_prob_pow, replace_prob_pow)

        # Add them up
        for ph_id, list_of_qvals in q_vals.items():
            final_dict.setdefault(ph_id, []).extend(list_of_qvals)

    # Average out the values
    final_dict = {k: (sum(v) / len(v), len(v)) for k, v in final_dict.items()}

    with open(dump_path, 'wb') as f:
        pickle.dump((final_dict, insert_prob, delete_prob, replace_prob), f)

    print("Q values:", final_dict)
    return final_dict, insert_prob, delete_prob, replace_prob
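
A hypothetical invocation of find_batch_q, with pickle paths mirroring the final_q_vals.pkl / probs.pkl names used in Example #9:

# Sketch only; both pickle files are produced elsewhere in the repo.
q_vals, insert_prob, delete_prob, replace_prob = find_batch_q(
    'pickle/final_q_vals.pkl', 'pickle/probs.pkl',
    dec_type='max', top_n=5, exp_factor=1)
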
Example #5
    def __init__(self, config_file, min_phones, recordings_dump_path):
        super().__init__(config_file)

        metadata = qval_metadata(config_file, min_phones, recordings_dump_path)
        ptoid = utils.load_phone_mapping(config_file)
        self.build_dataset(metadata.gen_pickle(), ptoid, bound_lengths=False)
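
A hypothetical instantiation, assuming this constructor belongs to the qval_dataloader class invoked in Example #4:

# Sketch only; the dump path follows the QValGenModel_in_<min_phones>.pkl pattern from Example #4.
config = read_yaml()
loader = qval_dataloader(config, min_phones=200,
                         recordings_dump_path='pickle/QValGenModel_in_200.pkl')
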
Example #6
    def __init__(self, config, to_do):

        super(liGRU, self).__init__(config)

        self.phone_to_id = utils.load_phone_mapping(config)
        self.rnn_name = config['rnn']
        # Store important parameters
        self.input_dim = config['n_mfcc'] + config['n_fbank']
        self.hidden_dim = config['hidden_dim']
        self.num_phones = config['num_phones']
        self.num_layers = config['num_layers']
        self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
        self.blank_token_id = self.phone_to_id['BLANK']

        self.ligru_act = nn.LeakyReLU(0.2)
        self.is_bidirectional = config['bidirectional']
        self.ligru_orthinit = True
        self.dropout_vals = [0.2] * self.num_layers
        self.hidden_dim_layers = [self.hidden_dim] * self.num_layers
        self.use_cuda = torch.cuda.is_available() and config['use_cuda']

        self.use_batchnorm = True
        self.use_batchnorm_inp = True

        # List initialization
        self.wh = nn.ModuleList([])
        self.uh = nn.ModuleList([])

        self.wz = nn.ModuleList([])  # Update Gate
        self.uz = nn.ModuleList([])  # Update Gate

        self.act = nn.ModuleList([])  # Activations

        # Batch Norm
        if self.use_batchnorm:
            self.bn_wh = nn.ModuleList([])
            self.bn_wz = nn.ModuleList([])

        # Input batch normalization
        if self.use_batchnorm_inp:
            if self.is_bidirectional:
                self.bn0 = nn.BatchNorm1d(2 * self.input_dim, momentum=0.05)
            else:
                self.bn0 = nn.BatchNorm1d(self.input_dim, momentum=0.05)

        current_input = self.input_dim

        # Initialization of hidden layers
        for i in range(self.num_layers):

            # Activations
            self.act.append(self.ligru_act)

            # Bias is folded into batch norm when it is enabled
            add_bias = not self.use_batchnorm

            # Feed-forward connections
            if i == 0 and self.is_bidirectional:
                self.wh.append(
                    nn.Linear(2 * current_input,
                              self.hidden_dim_layers[i],
                              bias=add_bias))
                self.wz.append(
                    nn.Linear(2 * current_input,
                              self.hidden_dim_layers[i],
                              bias=add_bias))
            else:
                self.wh.append(
                    nn.Linear(current_input,
                              self.hidden_dim_layers[i],
                              bias=add_bias))
                self.wz.append(
                    nn.Linear(current_input,
                              self.hidden_dim_layers[i],
                              bias=add_bias))

            # Recurrent connections
            self.uh.append(
                nn.Linear(self.hidden_dim_layers[i],
                          self.hidden_dim_layers[i],
                          bias=False))
            self.uz.append(
                nn.Linear(self.hidden_dim_layers[i],
                          self.hidden_dim_layers[i],
                          bias=False))

            if self.ligru_orthinit:
                nn.init.orthogonal_(self.uh[i].weight)
                nn.init.orthogonal_(self.uz[i].weight)

            if self.use_batchnorm:
                # batch norm initialization
                self.bn_wh.append(
                    nn.BatchNorm1d(self.hidden_dim_layers[i], momentum=0.05))
                self.bn_wz.append(
                    nn.BatchNorm1d(self.hidden_dim_layers[i], momentum=0.05))

            if self.is_bidirectional:
                current_input = 2 * self.hidden_dim_layers[i]
            else:
                current_input = self.hidden_dim_layers[i]

        self.out_dim = self.hidden_dim_layers[-1] * (2 if self.is_bidirectional else 1)

        if self.is_bidirectional:
            self.hidden_2_phone = nn.Linear(2 * self.hidden_dim,
                                            self.output_dim)
        else:
            self.hidden_2_phone = nn.Linear(self.hidden_dim, self.output_dim)

        self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id,
                                          reduction='mean',
                                          zero_infinity=False)
        print("Using CTC loss")

        optimizer = config['train']['optim']
        if optimizer == 'SGD':
            self.optimizer = optim.SGD(self.parameters(),
                                       lr=config['train']['lr'],
                                       momentum=0.9)
        elif optimizer == 'Adam':
            self.optimizer = optim.Adam(self.parameters(),
                                        lr=config['train']['lr'])
        else:
            raise ValueError("Unsupported optimizer: " + str(optimizer))
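
The CTC loss configured above expects log-probabilities in (T, N, C) layout. A self-contained sketch of a typical call (all shapes are illustrative, not taken from the repo):

import torch

loss_fn = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)
T, N, C = 50, 4, 42                                # time steps, batch, num_phones + pad + blank
log_probs = torch.randn(T, N, C).log_softmax(2)    # model outputs, log-softmaxed over classes
targets = torch.randint(1, C, (N, 20))             # padded label ids; 0 (blank) excluded
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.randint(10, 21, (N,), dtype=torch.long)
loss = loss_fn(log_probs, targets, input_lengths, target_lengths)
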
Example #7
    def __init__(self, config, mode):

        super(RNN, self).__init__(config)

        self.rnn_name = config['rnn']
        # Store important parameters
        self.feat_dim = config['n_mfcc'] + config['n_fbank']
        self.hidden_dim = config['hidden_dim']
        self.num_phones = config['num_phones']
        self.num_layers = config['num_layers']
        self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
        self.phone_to_id = utils.load_phone_mapping(config)
        self.blank_token_id = self.phone_to_id['BLANK']

        if config['bidirectional']:
            if self.rnn_name == 'LSTM':
                self.rnn = nn.LSTM(input_size=self.feat_dim,
                                   hidden_size=self.hidden_dim,
                                   num_layers=self.num_layers,
                                   dropout=config['dropout'],
                                   bidirectional=True,
                                   batch_first=True)
            else:
                self.rnn = nn.GRU(input_size=self.feat_dim,
                                  hidden_size=self.hidden_dim,
                                  num_layers=self.num_layers,
                                  dropout=config['dropout'],
                                  bidirectional=True,
                                  batch_first=True)

            # In linear network, *2 for bidirectional
            self.hidden2phone = nn.Linear(self.hidden_dim * 2, self.output_dim)
        else:
            if self.rnn_name == 'LSTM':
                self.rnn = nn.LSTM(input_size=self.feat_dim,
                                   hidden_size=self.hidden_dim,
                                   num_layers=self.num_layers,
                                   dropout=config['dropout'],
                                   bidirectional=False,
                                   batch_first=True)
            else:
                self.rnn = nn.GRU(input_size=self.feat_dim,
                                  hidden_size=self.hidden_dim,
                                  num_layers=self.num_layers,
                                  dropout=config['dropout'],
                                  bidirectional=False,
                                  batch_first=True)

            # Unidirectional, so no *2 on the hidden dimension
            self.hidden2phone = nn.Linear(self.hidden_dim,
                                          self.output_dim)  # for pad token

        self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id,
                                          reduction='mean',
                                          zero_infinity=False)
        print("Using CTC loss")

        optimizer = config['train']['optim']
        if optimizer == 'SGD':
            self.optimizer = optim.SGD(self.parameters(),
                                       lr=config['train']['lr'],
                                       momentum=0.9)
        elif optimizer == 'Adam':
            self.optimizer = optim.Adam(self.parameters(),
                                        lr=config['train']['lr'])
        else:
            raise ValueError("Unsupported optimizer: " + str(optimizer))
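
A sketch of the forward pass this constructor implies: padded batch_first inputs packed before the RNN and unpacked after (dimensions are illustrative; the repo's actual forward is not shown here):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn = nn.LSTM(input_size=39, hidden_size=128, num_layers=2,
              dropout=0.3, bidirectional=True, batch_first=True)
hidden2phone = nn.Linear(128 * 2, 42)                 # *2 for bidirectional

x = torch.randn(4, 100, 39)                           # (batch, time, features)
lengths = torch.tensor([100, 90, 75, 60])             # true lengths, sorted descending
packed = pack_padded_sequence(x, lengths, batch_first=True)
out, _ = rnn(packed)
out, _ = pad_packed_sequence(out, batch_first=True)   # (batch, time, 2 * hidden)
log_probs = hidden2phone(out).log_softmax(2)          # (batch, time, output_dim)
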
Example #8
    def __init__(self, config, mode):

        super(customRNN, self).__init__(config)

        self.rnn_name = config['rnn']
        # Store important parameters
        self.phone_to_id = utils.load_phone_mapping(config)
        self.feat_dim = config['n_mfcc'] + config['n_fbank']
        self.hidden_dim = config['hidden_dim']
        self.num_phones = config['num_phones']
        self.num_layers = config['num_layers']
        self.output_dim = self.num_phones + 2  # 1 for pad and 1 for blank
        self.blank_token_id = self.phone_to_id['BLANK']
        self.num_directions = 2 if config['bidirectional'] else 1

        dropout, r_dropout = config['dropout'], config['r_dropout']
        layer_norm, batch_norm = config['layerNorm'], config['batchnorm']

        if config['bidirectional']:
            if self.rnn_name == 'customLSTM':
                self.rnn = custom_rnn.LayerNormLSTM(self.feat_dim,
                                                    self.hidden_dim,
                                                    self.num_layers,
                                                    0.3,
                                                    0.3,
                                                    bidirectional=True,
                                                    layer_norm_enabled=True)
            elif self.rnn_name == 'customGRU':
                self.rnn = custom_rnn.customGRU(self.feat_dim,
                                                self.hidden_dim,
                                                self.num_layers,
                                                dropout=dropout,
                                                layer_norm_enabled=layer_norm,
                                                r_dropout=r_dropout,
                                                bidirectional=True)
            elif self.rnn_name == 'customliGRU':
                self.rnn = custom_rnn.customliGRU(self.feat_dim,
                                                  self.hidden_dim,
                                                  self.num_layers,
                                                  dropout=dropout,
                                                  bidirectional=True)

            # In linear network, *2 for bidirectional
            self.hidden2phone = nn.Linear(self.hidden_dim * 2, self.output_dim)
        else:
            if self.rnn_name == 'customLSTM':
                self.rnn = custom_rnn.LayerNormLSTM(self.feat_dim,
                                                    self.hidden_dim,
                                                    self.num_layers,
                                                    0.2,
                                                    0.2,
                                                    bidirectional=False,
                                                    layer_norm_enabled=True)
            elif self.rnn_name == 'customGRU':
                self.rnn = custom_rnn.customGRU(self.feat_dim,
                                                self.hidden_dim,
                                                self.num_layers,
                                                dropout=dropout,
                                                layer_norm_enabled=layer_norm,
                                                r_dropout=r_dropout,
                                                bidirectional=False)
            elif self.rnn_name == 'customliGRU':
                self.rnn = custom_rnn.customliGRU(self.feat_dim,
                                                  self.hidden_dim,
                                                  self.num_layers,
                                                  dropout=dropout,
                                                  bidirectional=False)

            # Unidirectional, so no *2 on the hidden dimension
            self.hidden2phone = nn.Linear(self.hidden_dim,
                                          self.output_dim)  # for pad token

        self.loss_func = torch.nn.CTCLoss(blank=self.blank_token_id,
                                          reduction='mean',
                                          zero_infinity=False)
        print("Using CTC loss")

        optimizer = config['train']['optim']

        if optimizer == 'SGD':
            self.optimizer = optim.SGD(self.parameters(),
                                       lr=config['train']['lr'],
                                       momentum=0.9)
        elif optimizer == 'Adam':
            self.optimizer = optim.Adam(self.parameters(),
                                        lr=config['train']['lr'])
        else:
            raise ValueError("Unsupported optimizer: " + str(optimizer))
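
Examples #6 through #8 share the same if/elif optimizer selection. A self-contained sketch of the same choice written as a dispatch table over a toy model (the optimizer name and learning rate stand in for config['train'] values):

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(39, 42)                 # toy model standing in for the RNNs above
optim_name, lr = 'Adam', 1e-3             # stand-ins for config['train']['optim'] / ['lr']
optimizers = {
    'SGD': lambda: optim.SGD(model.parameters(), lr=lr, momentum=0.9),
    'Adam': lambda: optim.Adam(model.parameters(), lr=lr),
}
optimizer = optimizers[optim_name]()      # raises KeyError on unsupported names
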
Example #9
def batch_test(config,
               dec_type,
               top_n,
               num_templates,
               num_compares,
               num_none,
               results_dump_path,
               exp_factor=1):
    """
    Master function which carries out actual testing
    :param dec_type: max or CTC
    :param top_n: top-n sequences are considered
    :param num_templates: number of templates for each keyword
    :param num_compares: number of clips in which each keyword needs to be searched for
    :param num_none: number of clips in which none of the keywords is present
    :param pr_dump_name: dump precision recall values
    :param results_dump_path: dump comparison results so that c values can be tweaked easily
    :param wrong_pred_path: path to folder where txt files are stored
    :param exp_factor: weight assigned to probability score
    """

    keywords = [
        'academic', 'reflect', 'equipment', 'program', 'rarely', 'national',
        'social', 'movies', 'greasy', 'water'
    ]
    # keywords = [
    #     'oily', 'people', 'before', 'living', 'potatoes', 'children', 'overalls', 'morning', 'enough', 'system',
    #     'water', 'greasy', 'suit', 'dark', 'very', 'without', 'money', 'reflect', 'program',
    #     'national', 'social', 'water', 'carry', 'time', 'before', 'always', 'often', 'people', 'money',
    #     'potatoes', 'children']
    # keywords = ['oily', 'people', 'before', 'living', 'water', 'children']
    # keywords = ['like', 'carry', 'will', 'potatoes', 'before', 'government', 'economic', 'overalls', 'through', 'money',
    # 'children']

    # NOTE: rnn_model is assumed to be defined at module scope (cf. dl_model('infer') in Example #4)
    test_case_name = 'test_cases_' + str(num_templates) + '_' + str(
        num_compares) + '_' + str(num_none) + '.pkl'
    pkl_name = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                            test_case_name)
    results_dump_path = os.path.join(config['dir']['pickle'],
                                     rnn_model.arch_name, results_dump_path)

    # generate cases to be tested on
    cases = gen_cases(['../datasets/TIMIT/TEST/', '../datasets/TIMIT/TRAIN/'],
                      '../datasets/TIMIT/TEST/', pkl_name, num_templates,
                      num_compares, num_none, keywords, config['gen_template'])

    infer_mode = config['infer_mode']

    if os.path.exists(results_dump_path):
        with open(results_dump_path, 'rb') as f:
            return pickle.load(f)
    else:
        # instantiate the model that computes predictions on the test cases
        test_model_obj = test_model(config, cases)

        # Q values and probabilities are loaded. Important to load probability values from HERE since
        # they influence thresholds and Q-values
        qval_pth = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                                'final_q_vals.pkl')
        prob_pth = os.path.join(config['dir']['pickle'], rnn_model.arch_name,
                                'probs.pkl')

        (thresholds, insert_prob, delete_prob,
         replace_prob) = find_batch_q(qval_pth,
                                      prob_pth,
                                      dec_type,
                                      top_n,
                                      exp_factor,
                                      rnn_model=rnn_model)

        # dictionary for storing c values required to declare keyword
        final_results = {kw: {} for kw in cases}

        # model outputs over all test clips
        db = test_model_obj.get_outputs()
        phone_to_id = utils.load_phone_mapping(config)
        id_to_phone = {v: k for k, v in phone_to_id.items()}

        # iterate over every clip and compare it with every template one-by-one
        # note that gr_phone_entire_clip is NOT USED
        for i, (output, length, gr_phone_entire_clip, word_in_clip,
                wav_path) in enumerate(db):

            if i % max(1, len(db) // 10) == 0:
                print("On output:", str(i) + "/" + str(len(db)))

            cur_out = output[:length]

            # generate lattice from current predictions
            lattices = generate_lattice(cur_out,
                                        rnn_model.model.blank_token_id,
                                        dec_type, top_n)
            # compare with every template
            for template_word, templates in cases.items():

                # if no keyword, then continue
                if template_word == 'NONE':
                    continue

                templates = templates['templates']
                final_results[template_word][i] = {'data': [], 'metadata': []}

                for template_phones in templates:
                    # template phone sequence
                    template_phone_ids = [
                        phone_to_id[x] for x in template_phones
                    ]

                    (pred_phones,
                     node_prob), final_lattice = traverse_best_lattice(
                         lattices, dec_type, template_phone_ids, insert_prob,
                         delete_prob, replace_prob)
                    # out_for_cnn[word_in_clip].append((pred_phones, node_prob, word_in_clip == template_word))
                    # node probabilities of best lattice
                    substring_phones = [id_to_phone[x] for x in pred_phones]
                    final_lattice = [id_to_phone[x[0]] for x in final_lattice]

                    # (loop-invariant; could be hoisted out of the template loop)
                    insert_prob_pow = np.power(insert_prob, exp_factor)
                    delete_prob_pow = np.power(delete_prob, exp_factor)
                    replace_prob_pow = np.power(replace_prob, exp_factor)

                    # calculate q values
                    q_vals = find_q_values(template_phone_ids, pred_phones,
                                           node_prob, insert_prob_pow,
                                           delete_prob_pow, replace_prob_pow)

                    metadata = (wav_path, word_in_clip, template_word,
                                gr_phone_entire_clip, final_lattice,
                                substring_phones, template_phones)
                    final_results[template_word][i]['metadata'].append(
                        metadata)

                    if infer_mode == 'group':
                        # sum up the predicted q values
                        predicted_log_val, gr_log_val = 0, 0

                        for pred_phone, vals in q_vals.items():
                            for val in vals:
                                predicted_log_val += np.log(val)
                            gr_log_val += (np.log(thresholds[pred_phone][0]) *
                                           len(vals))

                        if template_word == word_in_clip:
                            # gr_log_val should be < predicted_log_val + c
                            final_results[template_word][i]['data'].append(
                                ('right', gr_log_val, predicted_log_val))
                        else:
                            # gr_log_val should be > predicted_log_val + c
                            final_results[template_word][i]['data'].append(
                                ('wrong', gr_log_val, predicted_log_val))

                    elif infer_mode == 'indi':
                        above = 0
                        total_phones = 0
                        for pred_phone, vals in q_vals.items():
                            total_phones += len(vals)
                            for val in vals:
                                if val >= thresholds[pred_phone][0]:
                                    above += 1

                        if template_word == word_in_clip:
                            # gr_log_val should be < predicted_log_val + c
                            final_results[template_word][i]['data'].append(
                                ('right', above / total_phones))
                            # print('YES', above / total_phones)
                        else:
                            # gr_log_val should be > predicted_log_val + c
                            # print('NO', above / total_phones)
                            final_results[template_word][i]['data'].append(
                                ('wrong', above / total_phones))

                    else:
                        print("Infer mode not defined")
                        exit(1)

        with open(results_dump_path, 'wb') as f:
            pickle.dump(final_results, f)
            print("Dumped final results of testing")

        return final_results
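
A hypothetical invocation of batch_test, with small counts for illustration:

# Sketch only; read_yaml() and the pickle directory layout come from the repo's config handling.
config = read_yaml()
final_results = batch_test(config, dec_type='max', top_n=3,
                           num_templates=3, num_compares=50, num_none=10,
                           results_dump_path='results.pkl', exp_factor=1)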