Example #1
 def estimate_dist(s1, s2):
     ''' Estimate the distance between two sequences: Hamming distance when
         lengths match, falling back to full edit distance when needed '''
     n1 = len(s1)
     n2 = len(s2)
     if n1 == n2:  # same length: estimate distance as Hamming distance
         dist = sum(c1 != c2 for c1, c2 in zip(s1, s2))
         print(dist)
         if dist > dist_limit:  # Hamming distance large, recheck with edit distance (dist_limit: module-level threshold)
             dist = edit_distance(s1, s2, limit=dist_limit)
     else:  # different lengths: compute edit distance directly
         dist = edit_distance(s1, s2)
     return dist
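
The edit_distance helper these snippets call is never shown. As a reference point, here is a minimal Levenshtein implementation with an optional early-exit limit matching the call in example #1; the exact limit semantics are an assumption, not the original helper:

def edit_distance(s1, s2, limit=None):
    '''Levenshtein distance between two sequences (a sketch, not the original
    helper). If limit is given and every entry of the current DP row already
    exceeds it, stop early and return limit + 1.'''
    if len(s1) < len(s2):
        s1, s2 = s2, s1  # keep s2 as the shorter sequence
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current = [i + 1]
        for j, c2 in enumerate(s2):
            current.append(min(current[j] + 1,              # insertion
                               previous[j + 1] + 1,         # deletion
                               previous[j] + (c1 != c2)))   # substitution
        if limit is not None and min(current) > limit:
            return limit + 1  # distance provably exceeds the limit
        previous = current
    return previous[-1]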
Example #2
def exp_comb_1():
	'''
	Levenshtein distance:
	align words without considering time information
	'''
	mlf_1_parent_dir = 'plp/am/plp-bg'
	mlf_2_parent_dir = 'plp/am/plp-bg'
	mlf_1_pass = '******'
	mlf_2_pass = '******'
	episode = 'dev03_DEV001-20010117-XX2000'

	mlf1 = mlf_1_parent_dir + '/' + episode + '/' + mlf_1_pass + '/rescore.mlf'
	mlf2 = mlf_2_parent_dir + '/' + episode + '/' + mlf_2_pass + '/rescore.mlf'
	utl.edit_distance(mlf1, mlf2)
Example #3
def main(args):
    non_lang_syms = []
    if args.non_lang_syms is not None:
        with open(args.non_lang_syms, 'r', encoding='utf-8') as f:
            non_lang_syms = [x.rstrip() for x in f.readlines()]

    word_filters = []
    if args.wer_output_filter is not None:
        with open(args.wer_output_filter, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#!') or line == '':
                    continue
                elif line.startswith('s/'):
                    m = re.match(r's/(\S+)/(\w*)/g', line)
                    assert m is not None
                    word_filters.append([m.group(1), m.group(2)])
                elif line.startswith('s:'):
                    m = re.match(r's:(\S+):(\w*):g', line)
                    assert m is not None
                    word_filters.append([m.group(1), m.group(2)])
                else:
                    print('Unsupported pattern: "{}", ignored'.format(line),
                          file=sys.stderr)

    refs = {}
    with open(args.ref_text, 'r', encoding='utf-8') as f:
        for line in f:
            utt_id, text = line.strip().split(None, 1)
            assert utt_id not in refs, utt_id
            refs[utt_id] = text

    wer_counter = Counter()
    with open(args.hyp_text, 'r', encoding='utf-8') as f:
        for line in f:
            utt_id, text = line.strip().split(None, 1)
            assert utt_id in refs, utt_id
            ref, hyp = refs[utt_id], text

            # filter words according to word_filters (support re.sub only)
            for pattern, repl in word_filters:
                ref = re.sub(pattern, repl, ref)
                hyp = re.sub(pattern, repl, hyp)

            # filter out any non_lang_syms from ref and hyp
            ref_list = [x for x in ref.split() if x not in non_lang_syms]
            hyp_list = [x for x in hyp.split() if x not in non_lang_syms]

            _, _, counter = edit_distance(ref_list, hyp_list)
            wer_counter += counter

    assert wer_counter['words'] > 0
    wer = float(wer_counter['sub'] + wer_counter['ins'] +
                wer_counter['del']) / wer_counter['words'] * 100
    sub = float(wer_counter['sub']) / wer_counter['words'] * 100
    ins = float(wer_counter['ins']) / wer_counter['words'] * 100
    dlt = float(wer_counter['del']) / wer_counter['words'] * 100

    print('WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%, #words={:d}'.
          format(wer, sub, ins, dlt, wer_counter['words']))
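
Example #3 assumes an edit_distance(ref_list, hyp_list) whose third return value is a Counter of error types. A sketch of that assumed interface, backtracing the DP table to count substitutions, insertions, and deletions (the project's real helper may differ; the middle return value, the alignment, is omitted here):

from collections import Counter

def edit_distance(ref, hyp):
    '''Word-level edit distance; returns (distance, alignment, Counter) with
    'words', 'sub', 'ins', and 'del' keys, as example #3 expects. Sketch only.'''
    m, n = len(ref), len(hyp)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        d[i][0] = i
    for j in range(1, n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]))  # substitution
    counter = Counter(words=m)
    i, j = m, n
    while i > 0 or j > 0:  # backtrace to classify each edit
        if i > 0 and j > 0 and d[i][j] == d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]):
            if ref[i - 1] != hyp[j - 1]:
                counter['sub'] += 1
            i, j = i - 1, j - 1
        elif j > 0 and d[i][j] == d[i][j - 1] + 1:
            counter['ins'] += 1
            j -= 1
        else:
            counter['del'] += 1
            i -= 1
    return d[m][n], None, counter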
Example #4
 def get_similar(self, word, tolerance):
     similar = []
     for wd in self.words:
         distance = edit_distance(wd, word)
         if distance <= tolerance:
             similar.append(wd)
     return similar
Example #5
 def recursive_get_similar(tree, index):
     # word, tolerance, and similar come from the enclosing scope (this is an inner helper)
     distance = edit_distance(tree[index][0], word)
     if distance <= tolerance:
         similar.append(tree[index][0])
     for i in range(distance - tolerance, distance + tolerance + 1):
         if i in tree[index][1]:
             recursive_get_similar(tree, tree[index][1][i])
Example #6
 def recursive_add(tree, index):
     # word comes from the enclosing scope (this is an inner helper)
     if tree[index][0] == '':
         tree[index][0] = word
     else:
         distance = edit_distance(tree[index][0], word)
         if distance != 0:
             if distance not in tree[index][1]:
                 tree[index][1][distance] = len(tree)
                 tree.append(['', {}])
             recursive_add(tree, tree[index][1][distance])
Example #7
 def equal_queries(self, queries1, queries2):
     n = len(queries1)
     if n != len(queries2):
         return False
     for i in xrange(n):
         if utils.edit_distance(
                 queries1[i]['raw'], queries2[i]['raw'],
                 EDIT_DISTANCE_THRESHOLD) > EDIT_DISTANCE_THRESHOLD:
             return False
     return True
Example #8
 def add(self, word):
     index = 0
     while self.tree[index][0] != '':
         distance = edit_distance(self.tree[index][0], word)
         if distance == 0:
             return
         if distance not in self.tree[index][1]:
             self.tree[index][1][distance] = len(self.tree)
             self.tree.append(['', {}])
         index = self.tree[index][1][distance]
     self.tree[index][0] = word
Example #9
 def get_similar(self, word, tolerance):
     similar, st = [], [0]
     while st:
         index = st.pop()
         distance = edit_distance(self.tree[index][0], word)
         if distance <= tolerance:
             similar.append(self.tree[index][0])
         for i in range(distance - tolerance, distance + tolerance + 1):
             if i in self.tree[index][1]:
                 st.append(self.tree[index][1][i])
     return similar
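
Examples #5, #6, #8, and #9 are fragments of a BK-tree keyed by edit distance. Because edit distance satisfies the triangle inequality, when a query is at distance d from a node, only the children stored at distances in [d - tolerance, d + tolerance] can hold matches, which is exactly the pruning loop above. A self-contained sketch combining the iterative fragments, using an edit_distance like the one sketched after example #1:

class BKTree:
    '''Minimal BK-tree; each node is [word, {distance: child_index}].'''

    def __init__(self):
        self.tree = [['', {}]]  # root starts empty, as in example #8

    def add(self, word):
        index = 0
        while self.tree[index][0] != '':
            distance = edit_distance(self.tree[index][0], word)
            if distance == 0:
                return  # word already present
            if distance not in self.tree[index][1]:
                self.tree[index][1][distance] = len(self.tree)
                self.tree.append(['', {}])
            index = self.tree[index][1][distance]
        self.tree[index][0] = word

    def get_similar(self, word, tolerance):
        similar, stack = [], [0]
        while stack:
            index = stack.pop()
            distance = edit_distance(self.tree[index][0], word)
            if distance <= tolerance:
                similar.append(self.tree[index][0])
            # triangle inequality: only children in this band can match
            for i in range(distance - tolerance, distance + tolerance + 1):
                if i in self.tree[index][1]:
                    stack.append(self.tree[index][1][i])
        return similar

tree = BKTree()
for w in ['hello', 'help', 'shell', 'helm']:
    tree.add(w)
print(tree.get_similar('helo', tolerance=1))  # hello, help, helm: all 1 edit away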
Example #10
    def match_address(self):
        """
        Match visitor address fields.
        :return score: calculated address similarity score based on all attributes
        """
        new_address = self.new_v["visitor_addresses"]
        match_scores = []
        for prev_v in self.prev_vs:
            prev_address = prev_v["visitor_addresses"]
            results = {
                "Line1": 1.0 - edit_distance(prev_address["Line1"], new_address["Line1"]),
                "Line2": 1.0 - edit_distance(prev_address["Line2"], new_address["Line2"]),
                "City": exact_match(prev_address["City"], new_address["City"]),
                "Country": exact_match(prev_address["Country"], new_address["Country"]),
                "Postal_code": exact_match(prev_address["Postal_code"], new_address["Postal_code"]),
                "State": exact_match(prev_address["State"], new_address["State"]),
            }

            match_scores.append(generate_match_score(results, self.weights["visitor_addresses"]))

        return max(match_scores)
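
Note that example #10 needs an edit_distance that is already normalized to [0, 1], since it uses 1.0 - edit_distance(...) as a similarity score, unlike the integer distances elsewhere in this list. A plausible normalization, built on the sketch after example #1:

def normalized_edit_distance(s1, s2):
    '''Edit distance scaled by the longer string's length: 0.0 for identical
    strings, 1.0 for completely different equal-length strings.'''
    if not s1 and not s2:
        return 0.0
    return edit_distance(s1, s2) / float(max(len(s1), len(s2)))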
Example #11
    def _get_reward(self, offset=3):
        golden_standard_db = self.golden_standard_db

        data_cur = []

        if golden_standard_db[0][0] is None:
            print("THE GOLD STANDARD IS MORE LIKE SILVER...[?] HMMM")
            #print(self.current_data)
            #print(self.golden_standard_db)
            try:
                sys.exit(-1)
            except SystemExit:
                os._exit(-2)
        else:
            tmp = golden_standard_db[0][0].lower().replace(' ', '')

        golden_standard_db = [(tmp, golden_standard_db[0][1])]
        """
        data_cur.append((tup[0][0].lower().replace(' ', ''), tup[0][1]))AttributeError: 'spacy.tokens.span.Span' object has no attribute 'lower'
        """

        for tup in self.current_db:
            data_cur.append((str(tup[0][0]).lower().replace(' ', ''), tup[0][1]))

        a = set(golden_standard_db)

        if len(a) == 0:
            print("Well josue, the world is weird")
            try:
                print("ERROR IN THE FUNCTION _get_reward()")
                sys.exit(-1)
            except SystemExit:
                os._exit(-2)

        # TODO: PA: it shouldn't be the extracted NER from the snippet in self.current_data ?
        b = set(data_cur)

        # Jaccard index - penalty
        # penalty =  e^(alpha * len(b)) * u(len(b)-offset) + min (edit_distance(A,B)) / len(A_content)
        edit_vect = np.array(utils.edit_distance(a, b))  # Range: [0, inf)

        penalty = m.pow(
            m.e, self.alpha_reward * len(b)) * utils.step(len(b) - offset)
        penalty += edit_vect.mean() / utils.len_content(a)
        reward_cur = (len(a.intersection(b)) / len(a.union(b))) - penalty

        reward = reward_cur - self.reward_prev
        self.reward_prev = reward_cur

        return reward
Example #12
 def _correct(self, line):
     '''
     Replace words whose pinyin is within the edit-distance threshold of a known correct word
     '''
     flag = False
     for word in constants.CORRECT_WORDS:
         word_pinyin = ''.join(lazy_pinyin(word))
         segged_words = self._seg_sentence(word, line)
         for w in segged_words:
             w_pinyin = ''.join(lazy_pinyin(w))
             if utils.edit_distance(w_pinyin, word_pinyin) < constants.DISTANCE:
                 line = line.replace(w, constants.RIGHT_WORD)
                 flag = True
                 break
         if flag:
             break
     return line
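
Here lazy_pinyin comes from the pypinyin package, so words are compared by their romanized pronunciation rather than by their characters; this lets the threshold catch homophone and near-homophone typos that differ in every character. For example:

from pypinyin import lazy_pinyin  # pip install pypinyin

print(''.join(lazy_pinyin('中国')))  # 'zhongguo': toneless pinyin, joined as in _correct()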
Example #13
    def parse_database(self) -> dict:
        self._model = dict()

        for table_name in tqdm(self.table_names):
            df = self.read_table(table_name)

            self._model[table_name] = {}
            col_types = [
                df.iloc[0, index].__class__.__name__ for index in range(len(df.columns))
            ]

            smallest_pk_edit_distance = float("inf")
            previous_pk_field = None
            for field_name, field_type in zip(df.columns, col_types):
                self._model[table_name][field_name] = {}
                self._model[table_name][field_name]["type"] = field_type

                # extract primary keys
                if field_name[:3] == "pk_":
                    # possible primary key
                    distance = edit_distance(field_name[3:-3], table_name)
                    if distance < smallest_pk_edit_distance:
                        # closest primary key so far
                        if previous_pk_field is not None:
                            # remove previously assumed pk
                            self._model[table_name][previous_pk_field]["pk"] = False

                        # set current pk
                        self._model[table_name][field_name]["pk"] = True
                        previous_pk_field = field_name
                        smallest_pk_edit_distance = distance
                    else:
                        self._model[table_name][field_name]["pk"] = False
                else:
                    self._model[table_name][field_name]["pk"] = False

                # extract foreign keys
                if field_name[:3] == "fk_":
                    self._model[table_name][field_name]["fk"] = True
                    self._model[table_name][field_name][
                        "fk_table"
                    ] = self.get_tablename_from_fieldname(field_name)
                else:
                    self._model[table_name][field_name]["fk"] = False

        return self._model
Example #14
def get_pk_and_fk_from_table(fmp: FMP, df: pd.DataFrame, table_name: str):
    pk = None
    fks = []
    smallest_pk_edit_distance = float("inf")
    for field_name in df.columns:
        # extract primary keys
        if field_name[:3] == "pk_" and field_name[-3:] == "_id":
            # possible primary key
            distance = edit_distance(field_name[3:-3], table_name)
            if distance < smallest_pk_edit_distance:
                pk = field_name
                smallest_pk_edit_distance = distance

        # extract foreign keys
        if field_name[:3] == "fk_" and field_name[-3:] == "_id":
            fks.append((field_name, fmp.get_tablename_from_fieldname(field_name)))

    return pk, fks
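
get_pk_and_fk_from_table applies the same heuristic as parse_database above: among pk_*_id columns, the one whose stem (the text between pk_ and _id) has the smallest edit distance to the table name becomes the primary key. For a table named customer with columns pk_customer_id and pk_order_id, edit_distance('customer', 'customer') = 0 beats edit_distance('order', 'customer'), so pk_customer_id is chosen.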
Example #15
 def handle_category(self, cat, cht):
     '''
     Sees whether the category written already exists
     And prompts the user for the action to take if
     it doesn't.
     :param cat: the relevant category
     :param cht: the relevant cheatsheet
     :return: the destined name of the category, boolean to indicate
              whether an existing category was chosen or not
     '''
     if not cat[0].isupper():
         cat = cat[0].upper() + cat[1:]
     if cat in cht.keys():
         # category has been found and we can move on
         pass
     else:
         # recommend most similar categories or choose new one
         keys = [k for k in cht.keys() if k not in ["START", "END"]]
         recommendations = sorted([(i, utils.edit_distance(k, cat))
                                   for i, k in enumerate(keys)],
                                  key=itemgetter(1))
         self.add_msg("Category " + cat +
                      " not recognized. Choose one of following:")
         self.add_msg("-1 = add new category")
         for i, rec in recommendations[:min(3, len(keys))]:
             self.add_msg(str(i) + " = " + keys[i])
         inpt = self.get_input('Choice')
         inpt = int(inpt)
         if inpt == -1:
             return cat, False
         else:
             if inpt >= 0 and inpt < len(keys):
                 cat = keys[inpt]
             else:
                 self.add_msg("Invalid choice. Exiting program")
                 exit()
     return cat, True
Example #16
    def suggest_word(self, token: str):
        bigrams_token = get_bigrams(token)

        possible_similar_words = set()
        for bigram in bigrams_token:
            possible_similar_words = possible_similar_words.union(
                self.bigram_index.index[bigram])

        jaccard_sims = []
        for word in possible_similar_words:
            jaccard_sims.append((word,
                                 jaccard_similarity(set(bigrams_token),
                                                    set(get_bigrams(word)))))

        # sort candidates by jaccard similarity to the main token, most similar first
        jaccard_sims = sorted(jaccard_sims, key=lambda x: x[1], reverse=True)

        similar_words = jaccard_sims[:5]  # top-5 candidates with their jaccard similarity
        distances = [(t[0], edit_distance(token, t[0])) for t in similar_words]
        distances = sorted(distances, key=lambda x: x[1])
        correct_word = distances[0][0]

        return correct_word
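
The get_bigrams and jaccard_similarity helpers assumed by example #16 are not shown; minimal versions consistent with how they are called might be:

def get_bigrams(word):
    '''Character bigrams of a word, e.g. 'word' -> ['wo', 'or', 'rd'].
    (Hypothetical helper matching the calls above.)'''
    return [word[i:i + 2] for i in range(len(word) - 1)]

def jaccard_similarity(a, b):
    '''Intersection over union of two sets; defined as 1.0 when both are empty.'''
    if not a and not b:
        return 1.0
    return len(a & b) / len(a | b)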
Example #17
        val_ed = 0
        val_len = 0
        val_count = 0
        val_idx = 0  # assumed initialization; the snippet begins mid-function

        while val_idx < validation_size:
            mini_batch_feed_dict = {
                inputs: validation_batch['inputs'][val_idx:val_idx + params['batch_size']],
                seq_len: validation_batch['seq_lengths'][val_idx:val_idx + params['batch_size']],
                rnn_keep_prob: 1.0
            }

            prediction = sess.run(decoded, mini_batch_feed_dict)

            str_predictions = utils.sparse_tensor_to_strs(prediction)
            for i in range(len(str_predictions)):
                ed = utils.edit_distance(str_predictions[i], validation_batch['targets'][val_idx+i])
                val_ed = val_ed + ed
                val_len = val_len + len(validation_batch['targets'][val_idx+i])
                val_count = val_count + 1
                
            val_idx = val_idx + params['batch_size']
    
        print('[Epoch ' + str(epoch) + '] ' + str(1. * val_ed / val_count) +
              ' (' + str(100. * val_ed / val_len) + ' SER) from ' +
              str(val_count) + ' samples.')
        print('Saving the model...')
        saver.save(sess, args.save_model, global_step=epoch)
        print('------------------------------')
Example #18
def las_model_fn(features,
                 labels,
                 mode,
                 config,
                 params,
                 binf2phone=None,
                 run_name=None):
    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs, decoder_inputs_binf = None, None
    targets = None
    target_sequence_length = None
    targets_binf = None
    binf_embedding = None
    if binf2phone is not None and params.decoder.binary_outputs:
        binf_embedding = tf.constant(binf2phone,
                                     dtype=tf.float32,
                                     name='binf2phone')

    mapping = None
    if params.mapping and binf_embedding is not None:
        mapping = tf.convert_to_tensor(params.mapping)

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        if mapping is not None:
            decoder_inputs = tf.nn.embedding_lookup(mapping, decoder_inputs)
            targets = tf.nn.embedding_lookup(mapping, targets)
        target_sequence_length = labels['target_sequence_length']
        if binf_embedding is not None:
            targets_binf = tf.nn.embedding_lookup(tf.transpose(binf_embedding),
                                                  targets)
            decoder_inputs_binf = tf.nn.embedding_lookup(
                tf.transpose(binf_embedding), decoder_inputs)

    tf.logging.info('Building listener')
    with tf.variable_scope('listener'):
        (encoder_outputs,
         source_sequence_length), encoder_state = las.model.listener(
             encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')
    decoder_outputs, final_context_state, final_sequence_length = None, None, None
    if not params.decoder.binary_outputs or params.decoder.multitask:
        with tf.variable_scope('speller'):
            decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
                encoder_outputs, encoder_state, decoder_inputs,
                source_sequence_length, target_sequence_length, mode,
                params.decoder)

    decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = None, None, None
    if params.decoder.binary_outputs:
        with tf.variable_scope('speller_binf'):
            decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = las.model.speller(
                encoder_outputs, encoder_state, decoder_inputs_binf
                if not params.decoder.binf_projection else decoder_inputs,
                source_sequence_length, target_sequence_length, mode,
                params.decoder, not params.decoder.binf_projection,
                binf_embedding if not params.decoder.binf_sampling
                or params.decoder.beam_width > 0 else None)

    sample_ids_phones_binf, sample_ids_phones, sample_ids_binf, logits_binf, logits = None, None, None, None, None
    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            logits = tf.no_op()
            if decoder_outputs is not None:
                sample_ids_phones = decoder_outputs.predicted_ids
            if decoder_outputs_binf is not None:
                sample_ids_phones_binf = decoder_outputs_binf.predicted_ids
        else:
            if decoder_outputs is not None:
                logits = decoder_outputs.rnn_output
                sample_ids_phones = tf.to_int32(tf.argmax(logits, -1))
            if decoder_outputs_binf is not None:
                logits_binf = decoder_outputs_binf.rnn_output
                if params.decoder.binary_outputs and params.decoder.binf_sampling:
                    logits_phones_binf = transform_binf_to_phones(
                        logits_binf, binf_embedding)
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_phones_binf, -1))
                else:
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_binf, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        emb_c = tf.concat([x.c for x in encoder_state], axis=1)
        emb_h = tf.concat([x.h for x in encoder_state], axis=1)
        emb = tf.stack([emb_c, emb_h], axis=1)
        predictions = {
            'embedding': emb,
            'encoder_out': encoder_outputs,
            'source_length': source_sequence_length
        }
        if sample_ids_phones is not None:
            predictions['sample_ids'] = sample_ids_phones
        if logits_binf is not None:
            predictions['logits_binf'] = logits_binf
        if sample_ids_phones_binf is not None:
            predictions['sample_ids_phones_binf'] = sample_ids_phones_binf

        if final_context_state is not None:
            predictions['alignment'] = get_alignment_history(
                final_context_state, params)
        if final_context_state_binf is not None:
            predictions['alignment_binf'] = get_alignment_history(
                final_context_state_binf, params)

        if params.decoder.beam_width == 0:
            if params.decoder.binary_outputs and binf_embedding is None:
                predictions['probs'] = tf.nn.sigmoid(logits_binf)
            elif logits is not None:
                predictions['probs'] = tf.nn.softmax(logits)
            else:
                predictions['probs'] = tf.nn.softmax(logits_binf)

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    edit_distance, edit_distance_binf = None, None
    with tf.name_scope('metrics'):
        if sample_ids_phones is not None:
            edit_distance = utils.edit_distance(
                sample_ids_phones, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)
        if sample_ids_phones_binf is not None:
            edit_distance_binf = utils.edit_distance(
                sample_ids_phones_binf, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)
        metrics = {
            'edit_distance':
            tf.metrics.mean(edit_distance if edit_distance is not None else
                            edit_distance_binf),
        }

    # In TRAIN mode this running mean is significantly affected by early high values.
    # As a result, train summaries would look high and then drop after a restart.
    # To prevent this, we use the last batch's average in TRAIN.
    if mode != tf.estimator.ModeKeys.TRAIN:
        tf.summary.scalar('edit_distance', metrics['edit_distance'][1])
    elif not params.tpu_name:
        tf.summary.scalar(
            'edit_distance',
            tf.reduce_mean(edit_distance if edit_distance is not None else
                           edit_distance_binf))

    audio_loss_ipa, audio_loss_binf = None, None
    if logits is not None:
        with tf.name_scope('cross_entropy'):
            audio_loss_ipa = compute_loss(logits, targets,
                                          final_sequence_length,
                                          target_sequence_length, mode)

    if logits_binf is not None:
        with tf.name_scope('cross_entropy_binf'):
            if params.decoder.binf_projection:
                audio_loss_binf = compute_loss(logits_binf, targets,
                                               final_sequence_length_binf,
                                               target_sequence_length, mode)
            else:
                if mode == tf.estimator.ModeKeys.TRAIN:
                    audio_loss_binf = compute_loss_sigmoid(
                        logits_binf, targets_binf, final_sequence_length_binf,
                        target_sequence_length, mode)
                else:
                    audio_loss_binf = compute_loss_sigmoid(
                        logits_binf, targets, final_sequence_length_binf,
                        target_sequence_length, mode)

    audio_loss = 0
    if audio_loss_ipa is not None:
        audio_loss += audio_loss_ipa
    if audio_loss_binf is not None:
        audio_loss += audio_loss_binf
        tf.summary.scalar('audio_loss_binf', audio_loss_binf)

    ctc_edit_distance = None
    if params.ctc_weight > 0:
        ctc_logits = tf.layers.dense(encoder_outputs,
                                     params.decoder.target_vocab_size + 1,
                                     activation=None,
                                     name='ctc_logits')
        decoded_ctc, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(ctc_logits, [1, 0, 2]), source_sequence_length)
        decoded_ctc = tf.sparse.to_dense(decoded_ctc[0])
        decoded_ctc = tf.cast(decoded_ctc, tf.int32)
        if target_sequence_length is not None:
            ctc_loss = tf.nn.ctc_loss_v2(labels=targets,
                                         logits=ctc_logits,
                                         logits_time_major=False,
                                         label_length=target_sequence_length,
                                         logit_length=source_sequence_length)
            ctc_loss = tf.reduce_mean(ctc_loss, name='ctc_phone_loss')
            audio_loss += ctc_loss * params.ctc_weight
            tf.summary.scalar('ctc_loss', ctc_loss)
            with tf.name_scope('ctc_metrics'):
                ctc_edit_distance = utils.edit_distance(
                    decoded_ctc, targets, utils.EOS_ID,
                    params.mapping if mapping is None else None)
                metrics['ctc_edit_distance'] = tf.metrics.mean(
                    ctc_edit_distance)
            if mode != tf.estimator.ModeKeys.TRAIN:
                tf.summary.scalar('ctc_edit_distance',
                                  metrics['ctc_edit_distance'][1])
            else:
                tf.summary.scalar('ctc_edit_distance',
                                  tf.reduce_mean(ctc_edit_distance))

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state or final_context_state_binf)

        run_name = run_name or 'eval'
        if run_name != 'eval':
            # For other summaries, the 'eval' prefix is added automatically.
            run_name = 'eval_{}'.format(run_name)
        attention_summary = tf.summary.image('attention_images',
                                             attention_images)
        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=20,
            output_dir=os.path.join(config.model_dir, run_name),
            summary_op=attention_summary)
        hooks = [eval_summary_hook]
        loss = audio_loss
        log_data = {
            'edit_distance':
            tf.reduce_mean(edit_distance if edit_distance is not None else
                           edit_distance_binf),
            'max_edit_distance':
            tf.reduce_max(edit_distance if edit_distance is not None else
                          edit_distance_binf),
            'min_edit_distance':
            tf.reduce_min(edit_distance
                          if edit_distance is not None else edit_distance_binf)
        }
        logging_hook = tf.train.LoggingTensorHook(log_data, every_n_iter=20)
        hooks += [logging_hook]

        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics,
                                          evaluation_hooks=hooks)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        if params.tpu_name and params.tpu_name != 'fake':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        total_params = np.sum([np.prod(x.shape.as_list()) for x in var_list])
        tf.logging.info('Trainable parameters: {}'.format(total_params))

        regularizer = tf_contrib.layers.l2_regularizer(params.l2_reg_scale)
        reg_term = tf.contrib.layers.apply_regularization(
            regularizer, var_list)
        audio_loss = audio_loss + reg_term

        gvs = optimizer.compute_gradients(audio_loss, var_list=var_list)
        capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var)
                      for grad, var in gvs]
        train_op = optimizer.apply_gradients(
            capped_gvs, global_step=tf.train.get_global_step())
        if params.add_noise > 0:

            def add_noise():
                noise_ops = [train_op]
                for var in var_list:
                    if var.name.endswith('kernel:0'):
                        shape = tf.shape(var)
                        noise_op = tf.assign_add(
                            var,
                            tf.random_normal(shape,
                                             NOISE_MEAN,
                                             params.noise_std,
                                             dtype=tf.float32))
                        noise_ops.append(noise_op)
                print_op = tf.print('Adding noise to weights')
                return tf.group(*noise_ops, print_op)

            train_op = tf.cond(
                tf.logical_and(
                    tf.equal(
                        tf.mod(tf.train.get_global_step(),
                               params.add_noise), 0),
                    tf.greater(tf.train.get_global_step(), 0)), add_noise,
                lambda: train_op)

    loss = audio_loss
    train_log_data = {
        'loss':
        loss,
        'edit_distance':
        tf.reduce_mean(
            edit_distance if edit_distance is not None else edit_distance_binf)
    }
    if ctc_edit_distance is not None:
        train_log_data['ctc_edit_distance'] = tf.reduce_mean(ctc_edit_distance)
    logging_hook = tf.train.LoggingTensorHook(train_log_data, every_n_iter=10)

    if not params.tpu_name:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[logging_hook])
    else:
        return tf.estimator.tpu.TPUEstimatorSpec(mode,
                                                 loss=loss,
                                                 train_op=train_op)
Example #19
                    return process_name, process_status, process_parent_name, process_parent_pid

        except (psutil.NoSuchProcess, psutil.AccessDenied,
                psutil.ZombieProcess):
            pass


if __name__ == "__main__":

    #print find_process_parent_info('firefox')
    #print find_process_by_name('firefox')
    #print check_process_status('firefox')

    # get the process names
    critical_process_list = list()
    for proc in psutil.process_iter():
        p_name = proc.name()
        critical_process_list.append(p_name)

    # loop through the list and print
    #for entry in critical_process_list:
    #    print entry

    # find out list of critical linux process ( See CIS benchmarks, NIST guides)
    for entry in critical_process_list:
        for proc in psutil.process_iter():
            p_name = proc.name()
            process_distance = utils.edit_distance(p_name.lower(), entry)
            if process_distance < 5:
                print entry, p_name, process_distance
Example #20
                                                 sample_img.shape[1], 1)

    # LENGTH
    length = [batch_image.shape[2] / WIDTH_REDUCTION]

    # PREDICTION
    prediction = sess.run(decoded, {
        inputs: batch_image,
        seq_len: length,
        rnn_keep_prob: 1.0
    })

    str_predictions = utils.sparse_tensor_to_strs(prediction)

    # EVALUATION
    ed = utils.edit_distance(str_predictions[0], label)
    if ed != 0:
        val_err = val_err + 1
    val_ed = val_ed + ed
    val_len = val_len + len(label)
    val_count = val_count + 1

    # Counter
    val_idx = val_idx + 1

print('Samples: ' + str(val_count))
print('Acc Err: ' + str(val_err) + ' (Avg. Err: ' +
      str(1. * val_err / val_count) + ')')
print('Acc Ed: ' + str(val_ed) + ' (Avg. Ed: ' + str(1. * val_ed / val_count) +
      ')')
print('SER: ' + str(100. * val_ed / val_len))
Example #21
def las_model_fn(features, labels, mode, config, params):

    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs = None
    targets = None
    target_sequence_length = None

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        target_sequence_length = labels['target_sequence_length']

    tf.logging.info('Building listener')

    with tf.variable_scope('listener'):
        (encoder_outputs,
         source_sequence_length), encoder_state = las.model.listener(
             encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')

    with tf.variable_scope('speller'):
        decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
            encoder_outputs, encoder_state, decoder_inputs,
            source_sequence_length, target_sequence_length, mode,
            params.decoder)

    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            logits = tf.no_op()
            sample_ids = decoder_outputs.predicted_ids
        else:
            logits = decoder_outputs.rnn_output
            sample_ids = tf.to_int32(tf.argmax(logits, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'sample_ids': sample_ids,
        }

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    with tf.name_scope('metrics'):
        edit_distance = utils.edit_distance(sample_ids, targets, utils.EOS_ID,
                                            params.mapping)

        metrics = {
            'edit_distance': tf.metrics.mean(edit_distance),
        }

    tf.summary.scalar('edit_distance', metrics['edit_distance'][1])

    with tf.name_scope('cross_entropy'):
        loss = compute_loss(logits, targets, final_sequence_length,
                            target_sequence_length, mode)

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state)

        attention_summary = tf.summary.image('attention_images',
                                             attention_images)

        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=10,
            output_dir=os.path.join(config.model_dir, 'eval'),
            summary_op=attention_summary)

        logging_hook = tf.train.LoggingTensorHook(
            {
                'edit_distance': tf.reduce_mean(edit_distance),
                'max_edit_distance': tf.reduce_max(edit_distance),
                'max_predictions': sample_ids[tf.argmax(edit_distance)],
                'max_targets': targets[tf.argmax(edit_distance)],
                'min_edit_distance': tf.reduce_min(edit_distance),
                'min_predictions': sample_ids[tf.argmin(edit_distance)],
                'min_targets': targets[tf.argmin(edit_distance)],
            },
            every_n_iter=10)

        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=metrics,
            evaluation_hooks=[logging_hook, eval_summary_hook])

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        train_op = optimizer.minimize(loss,
                                      global_step=tf.train.get_global_step())

    logging_hook = tf.train.LoggingTensorHook(
        {
            'loss': loss,
            'edit_distance': tf.reduce_mean(edit_distance),
            #'max_edit_distance': tf.reduce_max(edit_distance),
            #'predictions': sample_ids[tf.argmax(edit_distance)],
            #'targets': targets[tf.argmax(edit_distance)],
        },
        every_n_secs=10)

    return tf.estimator.EstimatorSpec(mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=[logging_hook])
Example #22
def allele_analysis_differential_carbon_source(query_dir,
                                               expt_source,
                                               control_source='glc__D',
                                               ref_dir='reference/',
                                               rare_limit=-1,
                                               low_mem=True,
                                               write_log=False):
    ''' 
    Runs the following analysis for a given model 
    1) Simulate on an experimental carbon source and a control carbon source
    2) Identify reactions active only in the experimental conditions 
    3) Identify genes associated with the differential reactions 
    4) Compare upstream and coding sequences of those genes to reference strains

    Expects there to be four files in query_dir: <strain>.faa, <strain>.json,
    <strain>_cdhit_merged.faa.clstr, and <strain>_upstream.fna.

    If expt_source=None, does not do a differential analysis but instead 
    analyzes all genes as in steps 3/4.

    Parameters
    ----------
    query_dir : str
        Directory with model, upstream, and CD-Hit cluster files.
    expt_source : str
        Metabolite ID of carbon source to compare against control carbon source.
        Alternatively, if a list of reactionIDs are provided, uses those instead of
        attempting to identify differentially active reactions. Alternatively again,
        if None, will examine all reactions/genes.
    control_source : str
        Metabolite ID of carbon source to use as baseline (default glc__D)
    ref_dir : str
        Directory with reference materials, refer to recon.ref (default reference/)
    rare_limit : int
        If positive, only reports model alleles that have been observed at most 
        rare_limit times in the cluster file, i.e. if rare_limit = 1, only reports 
        alleles that have never been observed among reference strains. Reports 
        all if negative (default -1).
    low_mem : bool
        If True, only stores relevant sequences from reference genomes in memory by
        filtering by header; runs notably slower (default True)
    write_log : bool
        If True, saves/overwrites the allele report file in query_dir (default False)
    '''
    ''' Load files relevant to the query  '''
    for filename in os.listdir(query_dir):
        if filename[-4:] == '.faa':
            strain = filename[:-4]
    log_file = query_dir + '/' + strain + '_allele_report.txt'
    protein_file = query_dir + '/' + strain + '.faa'
    model_file = query_dir + '/' + strain + '.json'
    upstream_file = query_dir + '/' + strain + '_upstream.fna'
    cluster_file = query_dir + '/' + strain + '_cdhit_merged.faa.clstr'
    for filename in [protein_file, model_file, upstream_file, cluster_file]:
        if not os.path.exists(filename):
            print 'FILE IS MISSING, ABORTING:', filename
            return
    ''' Prepare output file '''
    if write_log:
        log_f = open(log_file, 'w+')

    def log_to_file(*argv):
        line = ' '.join(map(str, argv))
        print line  # print to console first
        if write_log:
            log_f.write(line + '\n')  # write to file

    ''' Extract all co-clustered reference genes '''

    def query_fxn(feature_name):
        return feature_name[:5] != '>lcl|'

    query_to_cluster = get_co_clustered(cluster_file, query_fxn)
    log_to_file('Loaded query clusters:', len(query_to_cluster))
    ''' Simulate differential growth '''
    model = cobra.io.load_json_model(model_file)
    if type(expt_source) == str:  # analyzing differential genes WRT carbon source
        expt_diff_genes, expt_diff_rxns = get_differential_reactions(
            model, expt_source, control_source)
        # for i in range(iters - 1):
        #     diff_genes = get_differential_reactions(model, expt_source, control_source)
        #     for gene in expt_diff_genes.keys(): # only records genes that are differential across multiple runs
        #         if not gene in diff_genes:
        #             print 'Marginal diff:', gene
        #             del expt_diff_genes[gene]
        log_to_file('Found differentially active reactions:',
                    len(expt_diff_rxns))
        log_to_file(sorted(expt_diff_rxns))
        log_to_file('Found differentially active genes:', len(expt_diff_genes))
        log_to_file(sorted(expt_diff_genes.keys()))
    elif type(expt_source) in (list, set, tuple):  # pre-determined reactions (need a better way to check iterables)
        expt_diff_genes = get_gene_mapping_for_reactions(model, expt_source)
        log_to_file('Found differentially active genes:', len(expt_diff_genes))
        log_to_file(sorted(expt_diff_genes.keys()))
    else:  # analyzing all genes
        expt_diff_genes = get_gene_mapping_for_reactions(
            model, map(lambda x: x.id, model.reactions))
        log_to_file('Analyzing all genes:', len(expt_diff_genes))
    ''' Extract locus tags for relevant reference genes '''
    header_to_label = {}  # map raw headers (without ">") to labels
    with open(ref_dir + '/ref_labels.tsv', 'r') as f:
        for line in f:
            name, label = line.split()
            header_to_label[name] = label

    matched_diff_queries = {}  # query_to_cluster reduced to just differentially active genes
    diff_clustered_ref_names = set()  # all reference genes co-clustered with a differential gene
    for match_gene in expt_diff_genes:
        query_gene = expt_diff_genes[match_gene][0]
        matched_diff_queries[query_gene] = query_to_cluster['>' + query_gene]
        diff_clustered_ref_names = diff_clustered_ref_names.union(
            query_to_cluster['>' + query_gene])
    del query_to_cluster
    diff_clustered_ref_loci = map(
        lambda x: header_to_label[x[1:]].split('|')[1],
        diff_clustered_ref_names)
    log_to_file('Found co-clustered reference genes:',
                len(diff_clustered_ref_loci))
    ''' Extract relevant sequences from reference and query files '''

    def filter_ref_seq(header):  # identifying reference protein sequences
        if len(header.strip()) == 0:  # catch empty lines
            return False
        return header.split()[0] in diff_clustered_ref_names

    def filter_ref_upstream(header):  # identifying reference upstream sequences
        if len(header.strip()) == 0:  # catch empty lines
            return False
        return header.split('|')[1] in diff_clustered_ref_loci

    def filter_query_seq(header):  # identifying query protein sequences
        return header[1:] in matched_diff_queries

    def filter_query_upstream(header):  # identifying query upstream sequences
        label = header[1:].split('|')
        label = label[0] + '|' + label[-1]
        return label in matched_diff_queries

    ref_seqs = {}  # maps ref headers ">lcl|..." to protein sequences
    ref_upstreams = {}  # maps ref headers "<strain>|<tag>|..." to upstream sequences
    query_seqs = {}  # maps query headers "<local gene>|<cluster gene>" to protein sequences
    query_upstreams = {}  # maps query headers "<local>|<up>|<cluster gene>" to upstream sequences

    ref_seq_dir = (ref_dir + '/ref_genomes/').replace('//', '/')
    for ref_seq_file in os.listdir(ref_seq_dir):
        if ref_seq_file[-4:] == '.faa':  # expect amino acid fasta
            ref_seq_path = ref_seq_dir + ref_seq_file
            if low_mem:  # Load only relevant sequences
                ref_seqs.update(
                    get_sequences_as_dict(ref_seq_path,
                                          select_fxn=filter_ref_seq))
            else:  # Load all sequences, faster since no filtering
                ref_seqs.update(get_sequences_as_dict(ref_seq_path))
    ref_seqs = {k.split()[0]: v for k, v in ref_seqs.iteritems()}
    log_to_file('Loaded reference sequences for genes:', len(ref_seqs))

    ref_upstream_dir = (ref_dir + '/ref_upstream/').replace('//', '/')
    for ref_upstream_file in os.listdir(ref_upstream_dir):
        if ref_upstream_file[-4:] == '.fna':  # expect nucleotide fasta
            ref_upstream_path = ref_upstream_dir + ref_upstream_file
            if low_mem:  # Load only relevant sequences
                ref_upstreams.update(
                    get_sequences_as_dict(ref_upstream_path,
                                          select_fxn=filter_ref_upstream))
            else:  # Load all sequences, faster since no filtering
                ref_upstreams.update(get_sequences_as_dict(ref_upstream_path))
    ref_upstreams = {k.split('|')[1]: v for k, v in ref_upstreams.iteritems()}
    log_to_file('Loaded reference upstream sequences for genes:',
                len(ref_upstreams))
    ''' Extract relevant query sequences '''
    query_seqs = get_sequences_as_dict(protein_file,
                                       select_fxn=filter_query_seq)
    query_seqs = {k[1:]: v for k, v in query_seqs.iteritems()}
    log_to_file('Loaded query sequences for genes:', len(query_seqs))

    query_upstreams = get_sequences_as_dict(upstream_file,
                                            select_fxn=filter_query_upstream)
    format_header = lambda x: x.split('|')[0] + '|' + x.split('|')[-1]
    query_upstreams = {
        format_header(k[1:]): v
        for k, v in query_upstreams.iteritems()
    }
    log_to_file('Loaded query upstream sequences for genes:',
                len(query_upstreams))
    ''' Report allele analysis of differential genes '''
    extreme_cases = 0
    for match_gene in sorted(expt_diff_genes.keys()):
        reported_gene = False
        query_gene = expt_diff_genes[match_gene][0]
        impacted_reactions = expt_diff_genes[match_gene][1]
        if query_gene is not None:
            query_seq = query_seqs[query_gene]
            query_ups = query_upstreams[query_gene][:53]
            ''' Get allele distribution of reference gene/upstream sequences '''
            co_clustered = matched_diff_queries[query_gene]
            seq_distr = {}
            ups_distr = {}
            for ref_gene in co_clustered:
                ref_tag = header_to_label[ref_gene[1:]].split('|')[1]
                if ref_tag in ref_upstreams and ref_gene in ref_seqs:
                    # exclude rare cases where either piece of information is missing
                    ref_seq = ref_seqs[ref_gene]
                    ref_ups = ref_upstreams[ref_tag][:53]
                    if not ref_seq in seq_distr:
                        seq_distr[ref_seq] = 0
                    if not ref_ups in ups_distr:
                        ups_distr[ref_ups] = 0
                    seq_distr[ref_seq] += 1
                    ups_distr[ref_ups] += 1
            ''' Add in query sequence '''
            if not query_seq in seq_distr:
                seq_distr[query_seq] = 0
            if not query_ups in ups_distr:
                ups_distr[query_ups] = 0
            seq_distr[query_seq] += 1
            ups_distr[query_ups] += 1
            ''' Report allele distribution '''
            query_seq_count = seq_distr[query_seq]
            query_ups_count = ups_distr[query_ups]
            if rare_limit < 0 or (query_seq_count <= rare_limit
                                  and query_ups_count <= rare_limit):
                extreme_cases += 1
                if not reported_gene:
                    log_to_file('\n-------------', extreme_cases, 'GENE:',
                                match_gene, '<->', query_gene, '-------------')
                    log_to_file('\nImpacted Reactions:',
                                ', '.join(impacted_reactions), '\n')
                    reported_gene = True
                ''' If sequence is unique, compute distances to nearest sequence in cluster '''
                seq_neighbor = ''
                ups_neighbor = ''
                if query_seq_count == 1:  # query sequence is new
                    min_seq_dist = len(query_seq)
                    for seq in seq_distr:  # start with hamming distance as quick estimate
                        if seq != query_seq:
                            dist = hamming_distance(seq, query_seq)
                            if dist < min_seq_dist:
                                min_seq_dist = dist
                                seq_neighbor = seq
                    if min_seq_dist > 2:  # if hamming distance is large, compute edit distance in full
                        for seq in seq_distr:
                            if seq != query_seq:
                                dist = edit_distance(seq, query_seq)
                                if dist < min_seq_dist:
                                    min_seq_dist = dist
                                    seq_neighbor = seq
                    min_seq_dist = int(min_seq_dist)
                    log_to_file(
                        'Unique sequence, distance to nearest neighbor:',
                        min_seq_dist)

                if query_ups_count == 1:  # query upstream sequence is new
                    min_ups_dist = len(query_ups)
                    for ups in ups_distr:
                        if ups != query_ups:
                            dist = hamming_distance(ups, query_ups)
                            if dist < min_ups_dist:
                                min_ups_dist = dist
                                ups_neighbor = ups
                    if min_ups_dist > 2:  # if hamming distance is large, compute edit distance in full
                        for ups in ups_distr:
                            if ups != query_ups:
                                dist = edit_distance(ups, query_ups)
                                if dist < min_ups_dist:
                                    min_ups_dist = dist
                                    ups_neighbor = ups
                    min_ups_dist = int(min_ups_dist)
                    log_to_file(
                        'Unique upstream, distance to nearest neighbor:',
                        min_ups_dist)
                ''' Report previews of co-clustered sequences/upstreams '''
                log_to_file('\nCoding sequence distribution:')
                log_to_file('Count\tLength\tSeq')
                for seq_allele in sorted(seq_distr.keys()):
                    count = str(seq_distr[seq_allele])
                    if seq_allele == query_seq:
                        count += '*'
                    elif seq_allele == seq_neighbor:
                        count += '^'
                    log_to_file(count, '\t', len(seq_allele), '\t',
                                seq_allele[:50] + '...')

                log_to_file('\nUpstream sequence distribution:')
                log_to_file('Count\tSeq')
                for ups_allele in sorted(ups_distr.keys()):
                    count = str(ups_distr[ups_allele])
                    if ups_allele == query_ups:
                        count += '*'
                    elif ups_allele == ups_neighbor:
                        count += '^'
                    log_to_file(count, '\t', ups_allele)
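
Example #22, like example #1, first screens candidates with an undefined hamming_distance. For equal-length strings, substitutions alone realize the Hamming distance, so it upper-bounds the edit distance; the full quadratic computation is only needed when that cheap bound is large. A minimal version:

def hamming_distance(s1, s2):
    '''Number of mismatching positions; for equal-length strings this
    upper-bounds edit distance, which is how examples #1 and #22 use it.'''
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))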
Example #23
    def test_folder(self, test_folder):

        for wav_file in sorted(os.listdir(test_folder)):

            # Read input test file
            wav_path = os.path.join(test_folder, wav_file)
            dump_path = wav_path[:-4] + '_pred.txt'

            # Read only wav
            if wav_file == '.DS_Store' or wav_file.split('.')[-1] != 'wav':  # or os.path.exists(dump_path):
                continue

            feat = utils.read_wav(wav_path,
                                  winlen=self.config['window_size'],
                                  winstep=self.config['window_step'],
                                  fbank_filt=self.config['n_fbank'],
                                  mfcc_filt=self.config['n_mfcc'])

            tsteps, hidden_dim = feat.shape
            # calculate log mel filterbank energies for complete file
            feat_log_full = np.reshape(feat, (1, tsteps, hidden_dim))
            lens = np.array([tsteps])
            # prepare tensors
            inputs = torch.from_numpy(np.array(feat_log_full)).float()
            lens = torch.from_numpy(np.array(lens)).long()
            id_to_phone = {v[0]: k for k, v in self.model.phone_to_id.items()}

            self.model.eval()

            with torch.no_grad():

                if self.cuda:
                    inputs = inputs.cuda()
                    lens = lens.cuda()

                # Pass through model
                outputs = self.model(inputs, lens).cpu().numpy()
                # Since only one example per batch and ignore blank token
                outputs = outputs[0]
                # softmax = np.exp(outputs) / np.sum(np.exp(outputs), axis=1)[:, None]
                # Take argmax to generate final string
                argmaxed = np.argmax(outputs, axis=1)
                # collapse according to CTC rules
                final_str = utils.collapse_frames(argmaxed,
                                                  self.model.blank_token_id)
                ans = [id_to_phone[a] for a in final_str]
                # Generate dumpable format of phone, start time and end time
                print("Predicted:", ans)

            phone_path = wav_path[:-3] + 'PHN'

            # If .PHN file exists, report edit distance
            if os.path.exists(phone_path):
                truth = utils.read_PHN_file(phone_path)
                edit_dist, ops = utils.edit_distance(truth, ans)
                print("Ground Truth:", truth, '\nEdit dsitance:', edit_dist)

                with open(dump_path, 'w') as f:
                    f.write('Predicted:\n')
                    f.write(' '.join(ans))
                    f.write('\nGround Truth:\n')
                    f.write(' '.join(truth))
                    f.write('\nEdit distance: ' + str(edit_dist))

            else:
                with open(dump_path, 'w') as f:
                    f.write('Predicted:\n')
                    f.write(' '.join(ans))
Example #24
        f" [INFO] {len(predicted)} predictions decoded in {round(time.time() - start, 2)} sec. "
    )

    if result_path is not None:
        if len(fnames) != len(predicted_text):
            fnames = [
                fname for fname in bboxs for j in range(len(bboxs[fname]))
            ]
        out = pd.DataFrame({"fname": fnames, "prediction": predicted_text})
        out_name = os.path.join(result_path, "prediction.csv")
        out.to_csv(out_name)
        print(" [INFO] Prediction example: \n", predicted_text[:10])
        print(" [INFO] Result store in: ", out_name)

    if validate:
        print(" [INFO] Computing edit distance metric... ")
        start = time.time()
        true_text = [
            decoder.labels_to_text(y_true[i]) for i in range(len(y_true))
        ]
        print(" [INFO] Example pairs (predicted, true): \n",
              list(zip(predicted_text[:10], true_text[:10])))
        edit_distance_score = edit_distance(predicted_text, true_text)
        normalized_edit_distance_score = normalized_edit_distance(
            predicted_text, true_text)
        print(
            f" [INFO] edit distances calculated in {round(time.time() - start, 2)} sec. "
        )
        print(
            " [INFO] mean edit distance: %f ; normalized edit distance score: %f"
            % (edit_distance_score, normalized_edit_distance_score))
Example #25
def las_model_fn(features,
                 labels,
                 mode,
                 config,
                 params,
                 binf2phone=None,
                 run_name=None):
    if tf.estimator.ModeKeys.PREDICT == mode:
        params.use_text = False

    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs, decoder_inputs_binf = None, None
    targets = None
    target_sequence_length = None

    binf_embedding = None
    if binf2phone is not None and params.decoder.binary_outputs:
        binf_embedding = tf.constant(binf2phone,
                                     dtype=tf.float32,
                                     name='binf2phone')
    is_binf_outputs = params.decoder.binary_outputs and params.decoder.binf_sampling and (
        binf_embedding is None or mode == tf.estimator.ModeKeys.TRAIN)

    mapping = None
    if params.mapping and binf_embedding is not None:
        mapping = tf.convert_to_tensor(params.mapping)

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        if mapping is not None:
            decoder_inputs = tf.nn.embedding_lookup(mapping, decoder_inputs)
            targets = tf.nn.embedding_lookup(mapping, targets)
        target_sequence_length = labels['target_sequence_length']
        if binf_embedding is not None:
            targets_binf = tf.nn.embedding_lookup(tf.transpose(binf_embedding),
                                                  targets)
            decoder_inputs_binf = tf.nn.embedding_lookup(
                tf.transpose(binf_embedding), decoder_inputs)

    text_loss = 0
    text_edit_distance = reader_encoder_state = None
    if params.use_text:
        tf.logging.info('Building reader')

        with tf.variable_scope('reader'):
            (reader_encoder_outputs, reader_source_sequence_length
             ), reader_encoder_state = text_ae.model.reader(
                 decoder_inputs, target_sequence_length, mode, params.encoder,
                 params.decoder.target_vocab_size)

        tf.logging.info('Building writer')

        with tf.variable_scope('writer'):
            writer_decoder_outputs, writer_final_context_state, writer_final_sequence_length = text_ae.model.speller(
                reader_encoder_outputs, reader_encoder_state, decoder_inputs,
                reader_source_sequence_length, target_sequence_length, mode,
                params.decoder)

        with tf.name_scope('text_prediction'):
            logits = writer_decoder_outputs.rnn_output
            sample_ids = tf.to_int32(tf.argmax(logits, -1))

        with tf.name_scope('text_metrics'):
            text_edit_distance = utils.edit_distance(
                sample_ids, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)

            metrics = {
                'text_edit_distance': tf.metrics.mean(text_edit_distance),
            }

        tf.summary.scalar('text_edit_distance',
                          metrics['text_edit_distance'][1])

        with tf.name_scope('text_cross_entropy'):
            text_loss = compute_loss(logits, targets,
                                     writer_final_sequence_length,
                                     target_sequence_length, mode)

    tf.logging.info('Building listener')

    with tf.variable_scope('listener'):
        (encoder_outputs,
         source_sequence_length), encoder_state = las.model.listener(
             encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')

    with tf.variable_scope('speller'):
        decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
            encoder_outputs, encoder_state, decoder_inputs,
            source_sequence_length, target_sequence_length, mode,
            params.decoder)

    decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = None, None, None
    if params.decoder.binary_outputs:
        with tf.variable_scope('speller_binf'):
            decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = las.model.speller(
                encoder_outputs, encoder_state, decoder_inputs_binf,
                source_sequence_length, target_sequence_length, mode,
                params.decoder, True,
                binf_embedding if not params.decoder.binf_sampling
                or params.decoder.beam_width > 0 else None)

    sample_ids_phones_binf, sample_ids_binf, logits_binf = None, None, None
    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            logits = tf.no_op()
            sample_ids_phones = decoder_outputs.predicted_ids
            if decoder_outputs_binf is not None:
                sample_ids_phones_binf = decoder_outputs_binf.predicted_ids
        else:
            logits = decoder_outputs.rnn_output
            sample_ids_phones = tf.to_int32(tf.argmax(logits, -1))
            if decoder_outputs_binf is not None:
                logits_binf = decoder_outputs_binf.rnn_output
                if params.decoder.binary_outputs and params.decoder.binf_sampling:
                    sample_ids_binf = tf.to_int32(
                        tf.round(tf.sigmoid(logits_binf)))
                    logits_phones_binf = transform_binf_to_phones(
                        logits_binf, binf_embedding)
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_phones_binf, -1))
                else:
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_binf, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
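        # Concatenate the final LSTM cell (c) and hidden (h) states across
        # layers into a fixed-size utterance embedding for the predictions.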
        emb_c = tf.concat([x.c for x in encoder_state], axis=1)
        emb_h = tf.concat([x.h for x in encoder_state], axis=1)
        emb = tf.stack([emb_c, emb_h], axis=1)
        predictions = {
            'sample_ids': sample_ids_phones,
            'embedding': emb,
            'encoder_out': encoder_outputs,
            'source_length': source_sequence_length
        }
        if logits_binf is not None:
            predictions['logits_binf'] = logits_binf
        if sample_ids_phones_binf is not None:
            predictions['sample_ids_phones_binf'] = sample_ids_phones_binf

        predictions['alignment'] = get_alignment_history(
            final_context_state, params)
        if final_context_state_binf is not None:
            predictions['alignment_binf'] = get_alignment_history(
                final_context_state_binf, params)

        if params.decoder.beam_width == 0:
            if params.decoder.binary_outputs and binf_embedding is None:
                predictions['probs'] = tf.nn.sigmoid(logits_binf)
            else:
                predictions['probs'] = tf.nn.softmax(logits)

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    metrics = None

    with tf.name_scope('metrics'):
        edit_distance = utils.edit_distance(
            sample_ids_phones, targets, utils.EOS_ID,
            params.mapping if mapping is None else None)

        metrics = {
            'edit_distance': tf.metrics.mean(edit_distance),
        }
    if params.use_text and not params.emb_loss:
        pass
    else:
        # In TRAIN mode the running mean is strongly skewed by early high
        # values, so the summary would look high and then drop after a restart.
        # To prevent this, we log the average of the last batch instead.
        if mode != tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar('edit_distance', metrics['edit_distance'][1])
        else:
            tf.summary.scalar('edit_distance', tf.reduce_mean(edit_distance))

    with tf.name_scope('cross_entropy'):
        audio_loss = compute_loss(logits, targets, final_sequence_length,
                                  target_sequence_length, mode)
    if is_binf_outputs:
        with tf.name_scope('cross_entropy_binf'):
            audio_loss_binf = compute_loss_sigmoid(logits_binf, targets_binf,
                                                   final_sequence_length,
                                                   target_sequence_length,
                                                   mode)
        audio_loss += audio_loss_binf

    emb_loss = 0
    if params.use_text:
        with tf.name_scope('embeddings_loss'):
            emb_loss = compute_emb_loss(encoder_state, reader_encoder_state)

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state)

        if params.use_text and not params.emb_loss:
            hooks = []
            loss = text_loss
        else:
            run_name = run_name or 'eval'
            if run_name != 'eval':
                # For other summaries eval is automatically added.
                run_name = 'eval_{}'.format(run_name)
            attention_summary = tf.summary.image('attention_images',
                                                 attention_images)
            eval_summary_hook = tf.train.SummarySaverHook(
                save_steps=20,
                output_dir=os.path.join(config.model_dir, run_name),
                summary_op=attention_summary)
            hooks = [eval_summary_hook]
            loss = audio_loss
        log_data = {
            'edit_distance': tf.reduce_mean(edit_distance),
            'max_edit_distance': tf.reduce_max(edit_distance),
            'min_edit_distance': tf.reduce_min(edit_distance)
        }
        if params.use_text:
            if not params.emb_loss:
                log_data = {}
            else:
                log_data['emb_loss'] = tf.reduce_mean(emb_loss)
            log_data['text_edit_distance'] = tf.reduce_mean(text_edit_distance)
        logging_hook = tf.train.LoggingTensorHook(log_data, every_n_iter=20)
        hooks += [logging_hook]

        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics,
                                          evaluation_hooks=hooks)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if params.use_text:
            audio_var_list = [
                x for x in var_list if not x.name.startswith('reader')
                and not x.name.startswith('writer')
            ]
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in audio_var_list])
            tf.logging.info(
                'Trainable audio parameters: {}'.format(total_params))
            text_var_list = [
                x for x in var_list if not x.name.startswith('listener')
                and not x.name.startswith('speller')
            ]
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in text_var_list])
            tf.logging.info(
                'Trainable text parameters: {}'.format(total_params))
            gvs = optimizer.compute_gradients(audio_loss,
                                              var_list=audio_var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var)
                          for grad, var in gvs]
            audio_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            gvs = optimizer.compute_gradients(text_loss,
                                              var_list=text_var_list)
            # Without attention the top layers won't affect anything,
            # so their gradients would be None.
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var)
                          for grad, var in gvs if grad is not None]
            text_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            gvs = optimizer.compute_gradients(emb_loss,
                                              var_list=audio_var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var)
                          for grad, var in gvs if grad is not None]
            emb_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            if not params.text_loss:
                tf.logging.info(
                    'Removing reader and writer from optimization.')
                train_op = tf.group(audio_train_op, emb_train_op)
            elif not params.emb_loss:
                tf.logging.info(
                    'Removing listener and speller from optimization.')
                train_op = text_train_op
            else:
                raise ValueError(
                    'text_loss and emb_loss cannot both be set with use_text!')
        else:
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in var_list])
            tf.logging.info('Trainable parameters: {}'.format(total_params))

            regularizer = tf_contrib.layers.l2_regularizer(params.l2_reg_scale)
            reg_term = tf.contrib.layers.apply_regularization(
                regularizer, var_list)
            audio_loss = audio_loss + reg_term

            gvs = optimizer.compute_gradients(audio_loss, var_list=var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            if params.add_noise > 0:
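                # Every `params.add_noise` global steps, add Gaussian noise to
                # all kernel weights after the gradient update (a simple
                # weight-noise regularizer).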

                def add_noise():
                    noise_ops = [train_op]
                    for var in var_list:
                        if var.name.endswith('kernel:0'):
                            shape = tf.shape(var)
                            noise_op = tf.assign_add(
                                var,
                                tf.random_normal(shape,
                                                 NOISE_MEAN,
                                                 params.noise_std,
                                                 dtype=tf.float32))
                            noise_ops.append(noise_op)
                    print_op = tf.print('Adding noise to weights')
                    return tf.group(*noise_ops, print_op)

                train_op = tf.cond(
                    tf.logical_and(
                        tf.equal(
                            tf.mod(tf.train.get_global_step(),
                                   params.add_noise), 0),
                        tf.greater(tf.train.get_global_step(), 0)), add_noise,
                    lambda: train_op)

    loss = text_loss if params.use_text and not params.emb_loss else audio_loss
    train_log_data = {'loss': loss}
    if params.use_text:
        if params.emb_loss:
            train_log_data['edit_distance'] = tf.reduce_mean(edit_distance)
            train_log_data['emb_loss'] = tf.reduce_mean(emb_loss)
        train_log_data['text_edit_distance'] = tf.reduce_mean(
            text_edit_distance)
    else:
        train_log_data['edit_distance'] = tf.reduce_mean(edit_distance)
    logging_hook = tf.train.LoggingTensorHook(train_log_data, every_n_iter=10)

    return tf.estimator.EstimatorSpec(mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=[logging_hook])
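
In this example `utils.edit_distance` works on batched id tensors, truncating at `utils.EOS_ID` and optionally remapping ids; its implementation is not included here. For orientation only, a hedged sketch of how a per-example distance is commonly computed in TensorFlow 1.x with `tf.edit_distance` over SparseTensors, assuming EOS doubles as padding and ignoring the remapping argument:

import tensorflow as tf

def dense_to_sparse(ids, pad_id):
    # keep only non-padding entries so tf.edit_distance sees true lengths
    indices = tf.where(tf.not_equal(ids, pad_id))
    values = tf.gather_nd(ids, indices)
    return tf.SparseTensor(indices, values,
                           tf.shape(ids, out_type=tf.int64))

def batch_edit_distance(sample_ids, targets, eos_id):
    hyp = dense_to_sparse(sample_ids, eos_id)
    ref = dense_to_sparse(targets, eos_id)
    # normalize=True divides each distance by the reference length,
    # yielding a per-example rate suitable for tf.metrics.mean
    return tf.edit_distance(hyp, ref, normalize=True)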
Example #26
    def test(self, epoch=None):

        self.model.eval()
        # edit distance of batch
        edit_dist_batch = 0
        # total number of ground-truth phones
        total_phones = 0
        # decode type
        decode_type = self.config['decode_type']
        # operations dictionary for calculating probabilities
        num_ph = self.model.num_phones
        op_dict = {}
        for i in range(num_ph):
            op_dict[i] = {
                'matches': 0,
                'insertions': 0,
                'deletions': 0,
                'substitutions': np.zeros(self.model.num_phones),
                'total': 0
            }

        print("Testing...")
        print('Total batches:', len(self.test_loader))
        test_loss = 0

        num_sequences = 0
        # to_dump_probs, to_dump_labels = [], []

        with torch.no_grad():

            if self.using_custom:
                dropout_mask_reset = [True
                                      ] * (self.model.num_layers *
                                           (1 + self.config['bidirectional']))
            else:
                dropout_mask_reset = None

            while True:

                # retrieve batch from dataloader
                inputs, labels, input_lens, label_lens, status = self.test_loader.return_batch(
                    self.cuda)

                # zero the parameter gradients
                self.model.optimizer.zero_grad()

                # forward
                if self.using_custom:
                    outputs = self.model(inputs, input_lens,
                                         dropout_mask_reset)
                    dropout_mask_reset = [False] * (
                        self.model.num_layers *
                        (1 + self.config['bidirectional']))
                else:
                    outputs = self.model(inputs, input_lens)

                # calculate loss
                loss = self.model.calculate_loss(outputs, labels, input_lens,
                                                 label_lens)
                print(loss)
                test_loss += loss.item()

                outputs = outputs.cpu().numpy()
                labels = labels.cpu().numpy()

                num_sequences += outputs.shape[0]

                # calculate edit distance between ground truth and predicted sequence
                for i in range(outputs.shape[0]):
                    # predict by argmax
                    if decode_type == 'max':
                        # argmax over the phone channel
                        argmaxed = np.argmax(outputs, axis=2)
                        seq = list(argmaxed[i][:input_lens[i]])
                        # collapse neighbouring and remove blank token
                        output_seq = utils.collapse_frames(
                            seq, self.model.blank_token_id)
                    else:
                        # predict by CTC decoding; apply softmax into a
                        # separate variable so it is not re-applied to
                        # `outputs` on every loop iteration
                        probs = utils.softmax(outputs)
                        output_seq = decode(probs[i, :input_lens[i], :], 1,
                                            self.model.blank_token_id)[0][0]

                    # ground truth
                    gr_truth = list(labels[i][:label_lens[i]])

                    # to_dump_probs.append(outputs[i][:input_lens[i], :])
                    # to_dump_labels.append(labels[i][:label_lens[i]])

                    # calculate edit distance and the required operations
                    dist, opr = utils.edit_distance(gr_truth, output_seq)

                    # increment number of phones
                    total_phones += len(gr_truth)

                    # update number of operations
                    for op_type, ids in opr.items():
                        if op_type == 'substitutions':
                            for orig, replace in ids:
                                op_dict[orig]['substitutions'][replace] += 1
                                op_dict[orig]['total'] += 1
                        else:
                            for idx in ids:
                                op_dict[idx][op_type] += 1
                                op_dict[idx]['total'] += 1

                    edit_dist_batch += dist

                if status == 1:
                    break

                print("Done with:", num_sequences, '/',
                      self.test_loader.num_egs)

        # Average out the losses and edit distance
        test_loss /= len(self.test_loader)
        edit_dist_batch /= total_phones

        print("Edit distance - %.4f %% , Loss: %.7f" %
              (edit_dist_batch * 100, test_loss))

        # Store in lists for keeping track of model performance
        self.edit_dist.append((edit_dist_batch, epoch))
        self.test_losses.append((test_loss, epoch))

        # if testing loss is minimum, store it as the 'best.pth' model, which is used for feature extraction
        # store only when doing train/test together i.e. mode is train
        # dump probabilities
        prob_insert, prob_del, prob_substi = np.zeros(num_ph), np.zeros(
            num_ph), np.zeros((num_ph, num_ph))

        if test_loss == min([x[0] for x in self.test_losses
                             ]) and self.mode == 'train':
            print("Best new model found!")
            self.model.save_model(True, epoch, self.train_losses,
                                  self.test_losses, self.edit_dist,
                                  self.arch_name)
            # Calculate the probabilities of insertion, deletion and substitution
            for ph, data in op_dict.items():
                prob_insert[ph] = data['insertions'] / data['total'] if data[
                    'total'] else 0
                prob_del[ph] = data['deletions'] / data['total'] if data[
                    'total'] else 0
                prob_substi[ph] = data['substitutions'] / data[
                    'total'] if data['total'] else 0

            # Dump best probability
            prob_dump_path = os.path.join(self.config['dir']['pickle'],
                                          self.arch_name, 'probs.pkl')
            with open(prob_dump_path, 'wb') as f:
                pickle.dump((prob_insert, prob_del, prob_substi), f)
                print("Dumped best probabilities")

        if self.mode == 'train':
            # Dump probabilities
            prob_dump_path = os.path.join(self.config['dir']['pickle'],
                                          self.arch_name,
                                          str(epoch) + '_probs.pkl')
            with open(prob_dump_path, 'wb') as f:
                pickle.dump((prob_insert, prob_del, prob_substi), f)
                print("Dumped probabilities")

        # with open('test_res.pkl', 'wb') as f:
        #     pickle.dump((to_dump_probs, to_dump_labels), f)
        self.model.train()

        return edit_dist_batch
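
Here `utils.edit_distance` returns both the distance and a dict of operations with the same keys as `op_dict` ('matches', 'insertions', 'deletions', 'substitutions'). A minimal sketch of one way to provide that interface, via a full DP table plus a backtrace; this is an assumption about the helper, not its actual code:

def edit_distance_with_ops(ref, hyp):
    # Full DP table so the alignment can be backtraced into the
    # operation lists consumed by the op_dict loop above.
    m, n = len(ref), len(hyp)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = int(ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # match / substitution
    ops = {'matches': [], 'insertions': [], 'deletions': [],
           'substitutions': []}
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and \
                dp[i][j] == dp[i - 1][j - 1] + int(ref[i - 1] != hyp[j - 1]):
            if ref[i - 1] != hyp[j - 1]:
                # (ground-truth phone, predicted phone), matching the
                # "for orig, replace in ids" loop above
                ops['substitutions'].append((ref[i - 1], hyp[j - 1]))
            else:
                ops['matches'].append(ref[i - 1])
            i, j = i - 1, j - 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            ops['deletions'].append(ref[i - 1])
            i -= 1
        else:
            ops['insertions'].append(hyp[j - 1])
            j -= 1
    return dp[m][n], ops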