def estimate_dist(s1, s2):
    '''
    Estimate distance between sequences via a modified Hamming distance
    vs. edit distance.

    Sequences of equal length are first compared position by position
    (Hamming distance). Only when that count exceeds the module-level
    ``dist_limit`` is ``edit_distance`` called, with ``limit=dist_limit``.
    Sequences of different length always go through ``edit_distance``
    without a limit.

    :param s1: first sequence
    :param s2: second sequence
    :return: the estimated distance between ``s1`` and ``s2``
    '''
    if len(s1) == len(s2):
        # same length, estimate distance as hamming distance
        dist = sum(c1 != c2 for c1, c2 in zip(s1, s2))
        if dist > dist_limit:
            # hamming distance large, try again with edit dist
            dist = edit_distance(s1, s2, limit=dist_limit)
    else:
        # different length, compute edit distance
        dist = edit_distance(s1, s2)
    return dist
def exp_comb_1():
    '''
    Run the Levenshtein-distance comparison between the ``rescore.mlf``
    files of two passes for one episode. Time information is not
    considered; only words are aligned.
    '''
    episode = 'dev03_DEV001-20010117-XX2000'
    parent_dirs = ('plp/am/plp-bg', 'plp/am/plp-bg')
    pass_names = ('******', '******')

    # <parent_dir>/<episode>/<pass>/rescore.mlf for each of the two systems
    first_mlf, second_mlf = [
        '/'.join((parent_dir, episode, pass_name, 'rescore.mlf'))
        for parent_dir, pass_name in zip(parent_dirs, pass_names)
    ]
    utl.edit_distance(first_mlf, second_mlf)
def _split_utterance(line):
    """Split a ``<utt_id> <text>`` line into ``(utt_id, text)``.

    Returns ``None`` for a blank line. A line that holds only an utterance
    id yields an empty transcript instead of failing to unpack.
    """
    fields = line.strip().split(None, 1)
    if not fields:
        return None
    return fields[0], (fields[1] if len(fields) > 1 else '')


def main(args):
    """Compute the word error rate of a hypothesis file and print it.

    Reads from ``args``:

    * ``non_lang_syms``: optional path; one symbol per line, removed from
      both reference and hypothesis before scoring.
    * ``wer_output_filter``: optional path; lines of the form
      ``s/<pattern>/<repl>/g`` or ``s:<pattern>:<repl>:g`` are applied with
      ``re.sub``. Blank lines and lines starting with ``#!`` are skipped;
      anything else is reported on stderr and ignored.
    * ``ref_text`` / ``hyp_text``: files with ``<utt_id> <text>`` lines.

    The per-utterance counters returned by ``edit_distance`` are summed and
    the WER, substitution, insertion and deletion rates are printed as
    percentages of the total number of words.

    Raises ``AssertionError`` for a malformed filter line, a duplicated
    reference id, a hypothesis id missing from the references, or a total
    word count of zero.
    """
    # Only membership tests are needed, so a set is enough.
    non_lang_syms = set()
    if args.non_lang_syms is not None:
        with open(args.non_lang_syms, 'r', encoding='utf-8') as f:
            non_lang_syms = {x.rstrip() for x in f}

    word_filters = []
    if args.wer_output_filter is not None:
        with open(args.wer_output_filter, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#!') or line == '':
                    continue
                elif line.startswith('s/'):
                    m = re.match(r's/(\S+)/(\w*)/g', line)
                    assert m is not None
                    word_filters.append([m.group(1), m.group(2)])
                elif line.startswith('s:'):
                    m = re.match(r's:(\S+):(\w*):g', line)
                    assert m is not None
                    word_filters.append([m.group(1), m.group(2)])
                else:
                    print('Unsupported pattern: "{}", ignored'.format(line),
                          file=sys.stderr)

    refs = {}
    with open(args.ref_text, 'r', encoding='utf-8') as f:
        for line in f:
            parsed = _split_utterance(line)
            if parsed is None:
                continue  # blank line
            utt_id, text = parsed
            assert utt_id not in refs, utt_id
            refs[utt_id] = text

    wer_counter = Counter()
    with open(args.hyp_text, 'r', encoding='utf-8') as f:
        for line in f:
            parsed = _split_utterance(line)
            if parsed is None:
                continue  # blank line
            # An empty hypothesis (id only) is kept and scored against its
            # reference rather than crashing the whole run.
            utt_id, text = parsed
            assert utt_id in refs, utt_id
            ref, hyp = refs[utt_id], text

            # filter words according to word_filters (support re.sub only)
            for pattern, repl in word_filters:
                ref = re.sub(pattern, repl, ref)
                hyp = re.sub(pattern, repl, hyp)

            # filter out any non_lang_syms from ref and hyp
            ref_list = [x for x in ref.split() if x not in non_lang_syms]
            hyp_list = [x for x in hyp.split() if x not in non_lang_syms]

            _, _, counter = edit_distance(ref_list, hyp_list)
            wer_counter += counter

    n_words = wer_counter['words']
    assert n_words > 0
    n_sub = wer_counter['sub']
    n_ins = wer_counter['ins']
    n_del = wer_counter['del']
    wer = float(n_sub + n_ins + n_del) / n_words * 100
    sub = float(n_sub) / n_words * 100
    ins = float(n_ins) / n_words * 100
    dlt = float(n_del) / n_words * 100

    print('WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%, #words={:d}'.
          format(wer, sub, ins, dlt, n_words))
def get_similar(self, word, tolerance):
    """Return the entries of ``self.words`` within ``tolerance`` of ``word``.

    Every stored word is compared with ``edit_distance``; matches are
    returned in the order they appear in ``self.words``.
    """
    return [
        candidate
        for candidate in self.words
        if edit_distance(candidate, word) <= tolerance
    ]
def recursive_get_similar(tree, index):
    """Collect matches for ``word`` from the subtree rooted at ``index``.

    Each node is ``[node_word, children]`` where ``children`` maps a
    distance to the index of a child node in ``tree``. ``word``,
    ``tolerance`` and the result list ``similar`` are taken from the
    enclosing scope.
    """
    node = tree[index]
    gap = edit_distance(node[0], word)
    if gap <= tolerance:
        similar.append(node[0])

    # Descend only into children keyed by a distance within ``tolerance``
    # of this node's distance, in increasing key order.
    lowest, highest = gap - tolerance, gap + tolerance
    for edge in range(lowest, highest + 1):
        if edge in node[1]:
            recursive_get_similar(tree, node[1][edge])
def recursive_add(tree, index):
    """Insert ``word`` (from the enclosing scope) below node ``index``.

    An empty node (word ``''``) takes the word directly. Otherwise the word
    is pushed into the child keyed by its distance to this node, creating
    that child as an empty node first when it is missing. A distance of 0
    means the word is already stored, so nothing happens.
    """
    node = tree[index]
    if node[0] == '':
        node[0] = word
        return

    gap = edit_distance(node[0], word)
    if gap == 0:
        return

    children = node[1]
    if gap not in children:
        # The new node will live at the current end of the list.
        children[gap] = len(tree)
        tree.append(['', {}])
    recursive_add(tree, children[gap])
def equal_queries(self, queries1, queries2):
    """Return True when two query lists match pairwise.

    The lists must have the same length, and for every position the
    ``'raw'`` strings of both queries must be within
    ``EDIT_DISTANCE_THRESHOLD`` according to ``utils.edit_distance`` (which
    also receives the threshold as its third argument).

    Uses ``zip`` rather than ``xrange`` so the method runs on both
    Python 2 and Python 3.
    """
    if len(queries1) != len(queries2):
        return False
    return all(
        utils.edit_distance(
            first['raw'], second['raw'], EDIT_DISTANCE_THRESHOLD)
        <= EDIT_DISTANCE_THRESHOLD
        for first, second in zip(queries1, queries2)
    )
def add(self, word):
    """Insert ``word`` into ``self.tree`` without recursion.

    ``self.tree`` is a list of ``[node_word, children]`` nodes; ``children``
    maps a distance to the index of a child node. Starting at node 0 the
    walk follows the child keyed by the distance between the node's word
    and ``word``, appending an empty node when that child is missing, until
    it lands on an empty node, which then receives ``word``. Returns early
    when a node at distance 0 is found (word already present).
    """
    tree = self.tree
    index = 0
    while True:
        node = tree[index]
        if node[0] == '':
            break
        gap = edit_distance(node[0], word)
        if gap == 0:
            return
        children = node[1]
        if gap not in children:
            children[gap] = len(tree)
            tree.append(['', {}])
        index = children[gap]
    tree[index][0] = word
def get_similar(self, word, tolerance):
    """Return stored words within ``tolerance`` of ``word``.

    Walks ``self.tree`` (a list of ``[node_word, children]`` nodes) with an
    explicit stack, starting from node 0. At each node only the children
    keyed by a distance in ``[d - tolerance, d + tolerance]`` are queued,
    where ``d`` is the distance between the node's word and ``word``.
    """
    matches = []
    pending = [0]
    while pending:
        node = self.tree[pending.pop()]
        gap = edit_distance(node[0], word)
        if gap <= tolerance:
            matches.append(node[0])
        children = node[1]
        pending.extend(
            children[edge]
            for edge in range(gap - tolerance, gap + tolerance + 1)
            if edge in children
        )
    return matches
def match_address(self):
    """
    Match visitors address fields.

    The new visitor's address is scored against the address of every
    previous visitor in ``self.prev_vs``. ``Line1`` and ``Line2`` contribute
    ``1.0 - edit_distance(...)`` (presumably a normalised distance; confirm
    against ``edit_distance``), the remaining fields use ``exact_match``.
    The per-field results are combined by ``generate_match_score`` with the
    ``"visitor_addresses"`` weights.

    :return score: highest address match score over all previous visitors
    """
    new_address = self.new_v["visitor_addresses"]

    def score_against(prev_address):
        # Key order is kept: Line1, Line2, then the exact-match fields.
        field_results = {}
        for fuzzy_key in ("Line1", "Line2"):
            field_results[fuzzy_key] = 1.0 - edit_distance(
                prev_address[fuzzy_key], new_address[fuzzy_key])
        for exact_key in ("City", "Country", "Postal_code", "State"):
            field_results[exact_key] = exact_match(
                prev_address[exact_key], new_address[exact_key])
        return generate_match_score(
            field_results, self.weights["visitor_addresses"])

    return max([
        score_against(prev_v["visitor_addresses"])
        for prev_v in self.prev_vs
    ])
def _get_reward(self, offset=3):
    """Return the change in reward since the previous call.

    The current reward is the Jaccard index between the gold-standard set
    and the set built from ``self.current_db``, minus a penalty. The value
    returned is ``reward_cur - self.reward_prev``; ``self.reward_prev`` is
    then updated to ``reward_cur``.

    :param offset: shift applied to ``len(b)`` before it is passed to
        ``utils.step`` in the penalty term
    :return: difference between the current and the previous reward
    """
    golden_standard_db = self.golden_standard_db
    data_cur = []
    if golden_standard_db[0][0] is None:
        print("THE GOLD STANDARD IS MORE LIKE SILVER...[?] HMMM")
        #print(self.current_data)
        #print(self.golden_standard_db)
        # sys.exit raises SystemExit, which is caught right away so that
        # os._exit can terminate the process immediately.
        try:
            sys.exit(-1)
        except SystemExit:
            os._exit(-2)
    else:
        # Normalise the first gold entry: lower-case, spaces removed.
        # Only element [0] of the gold standard is kept.
        tmp = golden_standard_db[0][0].lower().replace(' ', '')
        golden_standard_db = [(tmp, golden_standard_db[0][1])]
    # The bare string below is a leftover note (a no-op statement); it
    # explains why str() is applied to tup[0][0] in the loop that follows.
    """ data_cur.append((tup[0][0].lower().replace(' ', ''), tup[0][1]))AttributeError: 'spacy.tokens.span.Span' object has no attribute 'lower' """
    for tup in self.current_db:
        data_cur.append((str(tup[0][0]).lower().replace(' ', ''), tup[0][1]))
    a = set(golden_standard_db)
    # NOTE(review): `a` is built from the one-element list created above, so
    # this branch looks unreachable as written.
    if len(a) == 0:
        print("Well josue, the world is weird")
        try:
            print("ERROR IN THE FUNCTION _get_reward()")
            sys.exit(-1)
        except SystemExit:
            os._exit(-2)
    # TODO: PA: it shouldn't be the extracted NER from the snippet in self.current_data ?
    b = set(data_cur)
    # Jaccard index - penalty
    # penalty = e^(alpha * len(b)) * u(len(b)-offset) + min (edit_distance(A,B)) / len(A_content)
    # NOTE(review): the formula above says min, but the code below uses the
    # mean of the edit-distance vector.
    edit_vect = np.array(utils.edit_distance(a, b))
    # Range: [0, inf)
    penalty = m.pow(
        m.e, self.alpha_reward * len(b)) * utils.step(len(b) - offset)
    penalty += edit_vect.mean() / utils.len_content(a)
    # NOTE(review): under Python 2 without `from __future__ import division`
    # this `/` would be an integer division; confirm the interpreter.
    reward_cur = (len(a.intersection(b)) / len(a.union(b))) - penalty
    reward = reward_cur - self.reward_prev
    self.reward_prev = reward_cur
    return reward
def _correct(self, line):
    '''
    Replace a word whose edit distance is below the threshold.

    For every entry of ``constants.CORRECT_WORDS`` the line is segmented
    with ``self._seg_sentence`` and each segment is compared with the entry
    on their pinyin spellings. The first segment whose distance is below
    ``constants.DISTANCE`` is replaced throughout the line by
    ``constants.RIGHT_WORD`` and the result is returned immediately; at most
    one such replacement is made. The line is returned unchanged when no
    segment matches.
    '''
    for target in constants.CORRECT_WORDS:
        target_pinyin = ''.join(lazy_pinyin(target))
        for segment in self._seg_sentence(target, line):
            segment_pinyin = ''.join(lazy_pinyin(segment))
            if utils.edit_distance(segment_pinyin, target_pinyin) < constants.DISTANCE:
                return line.replace(segment, constants.RIGHT_WORD)
    return line
def parse_database(self) -> dict():
    """Build ``self._model``, a description of every table, and return it.

    For each name in ``self.table_names`` the table is loaded with
    ``self.read_table`` and every column gets an entry with:

    * ``"type"``: class name of the value in the first row,
    * ``"pk"``: True for at most one ``pk_``-prefixed column per table, the
      one whose middle part (prefix and last three characters stripped) has
      the smallest edit distance to the table name; the earliest column
      wins ties,
    * ``"fk"``: True for ``fk_``-prefixed columns, which also get
      ``"fk_table"`` from ``self.get_tablename_from_fieldname``.
    """
    self._model = dict()
    for table_name in tqdm(self.table_names):
        df = self.read_table(table_name)
        table_model = {}
        self._model[table_name] = table_model

        first_row_types = [
            df.iloc[0, position].__class__.__name__
            for position in range(len(df.columns))
        ]

        best_distance = float("inf")
        current_pk = None
        for field_name, field_type in zip(df.columns, first_row_types):
            info = {"type": field_type}
            table_model[field_name] = info
            prefix = field_name[:3]

            # extract primary keys
            is_pk = False
            if prefix == "pk_":
                # possible primary key
                distance = edit_distance(field_name[3:-3], table_name)
                if distance < best_distance:
                    # closest primary key so far: demote the previous one
                    if current_pk is not None:
                        table_model[current_pk]["pk"] = False
                    is_pk = True
                    current_pk, best_distance = field_name, distance
            info["pk"] = is_pk

            # extract foreign keys
            if prefix == "fk_":
                info["fk"] = True
                info["fk_table"] = self.get_tablename_from_fieldname(field_name)
            else:
                info["fk"] = False
    return self._model
def get_pk_and_fk_from_table(fmp: FMP, df: pd.DataFrame, table_name: str):
    """Find the primary key and the foreign keys among the columns of ``df``.

    Only columns ending in ``_id`` are considered. Among the ``pk_`` columns
    the primary key is the one whose middle part (prefix and suffix
    stripped) has the smallest edit distance to ``table_name``; the earliest
    column wins ties. Every ``fk_`` column is reported together with the
    table name given by ``fmp.get_tablename_from_fieldname``.

    :return: ``(pk, fks)`` where ``pk`` is a column name or ``None`` and
        ``fks`` is a list of ``(column, table_name)`` tuples
    """
    best_pk = None
    best_distance = float("inf")
    foreign_keys = []

    for column in df.columns:
        if column[-3:] != "_id":
            continue
        prefix = column[:3]
        if prefix == "pk_":
            # possible primary key
            candidate_distance = edit_distance(column[3:-3], table_name)
            if candidate_distance < best_distance:
                best_pk, best_distance = column, candidate_distance
        elif prefix == "fk_":
            foreign_keys.append(
                (column, fmp.get_tablename_from_fieldname(column)))

    return best_pk, foreign_keys
def handle_category(self, cat, cht):
    '''
    Sees whether the category written already exists
    And prompts the user for the action to take if it doesn't.

    The first character of the category is upper-cased before the lookup.
    When the category is unknown, the three existing categories closest to
    it by edit distance are offered (``START`` and ``END`` are never
    offered) and the user picks one by index, or ``-1`` to create a new
    category. Any other answer, including one that is not an integer, is
    reported as an invalid choice and the program exits.

    :param cat: the relevant category
    :param cht: the relevant cheatsheet
    :return: the destined name of the category, boolean to indicate whether
        an existing category was chosen or not
    '''
    # Guard against an empty name: there is no first character to upper-case.
    if cat and not cat[0].isupper():
        cat = cat[0].upper() + cat[1:]

    if cat in cht:
        # category has been found and we can move on
        return cat, True

    # recommend most similar categories or choose new one
    keys = [k for k in cht if k not in ("START", "END")]
    recommendations = sorted(
        ((i, utils.edit_distance(k, cat)) for i, k in enumerate(keys)),
        key=itemgetter(1))
    self.add_msg("Category " + cat + " not recognized. Choose one of following:")
    self.add_msg("-1 = add new category")
    for i, _ in recommendations[:3]:
        self.add_msg(str(i) + " = " + keys[i])

    inpt = self.get_input('Choice')
    try:
        choice = int(inpt)
    except (TypeError, ValueError):
        # Non-numeric input takes the invalid-choice path below instead of
        # crashing with a traceback.
        choice = None

    if choice == -1:
        return cat, False
    if choice is not None and 0 <= choice < len(keys):
        return keys[choice], True

    self.add_msg("Invalid choice. Exiting program")
    exit()
def suggest_word(self, token: str):
    """Suggest the indexed word that best matches ``token``.

    Candidates are all words that ``self.bigram_index.index`` lists under
    any bigram of ``token``. They are ranked by Jaccard similarity of their
    bigram sets with the token's; the five most similar are kept and the
    one with the smallest edit distance to ``token`` is returned.
    """
    token_bigrams = get_bigrams(token)

    candidates = set()
    for gram in token_bigrams:
        candidates = candidates.union(self.bigram_index.index[gram])

    scored = [
        (candidate,
         jaccard_similarity(set(token_bigrams), set(get_bigrams(candidate))))
        for candidate in candidates
    ]
    # most similar first (higher Jaccard similarity is better)
    scored.sort(key=lambda pair: pair[1], reverse=True)
    shortlist = scored[:5]

    # re-rank the shortlist by edit distance to the token, closest first
    by_edit_distance = sorted(
        ((candidate, edit_distance(token, candidate))
         for candidate, _ in shortlist),
        key=lambda pair: pair[1])
    return by_edit_distance[0][0]
# Validation pass: decode the validation set in mini-batches, accumulate the
# edit distance against the targets, report it and save a checkpoint.
# NOTE(review): val_idx is not initialised in this fragment; it must be set
# before this point.
val_ed = 0     # summed edit distance over all validated samples
val_len = 0    # summed target length over all validated samples
val_count = 0  # number of validated samples

while val_idx < validation_size:
    # Slice the next batch_size samples out of the validation batch.
    mini_batch_feed_dict = {
        inputs: validation_batch['inputs'][val_idx:val_idx+params['batch_size']],
        seq_len: validation_batch['seq_lengths'][val_idx:val_idx+params['batch_size']],
        rnn_keep_prob: 1.0  # presumably disables dropout for evaluation; confirm
    }

    prediction = sess.run(decoded, mini_batch_feed_dict)

    str_predictions = utils.sparse_tensor_to_strs(prediction)

    for i in range(len(str_predictions)):
        ed = utils.edit_distance(str_predictions[i], validation_batch['targets'][val_idx+i])
        val_ed = val_ed + ed
        val_len = val_len + len(validation_batch['targets'][val_idx+i])
        val_count = val_count + 1

    val_idx = val_idx + params['batch_size']

# Reports the mean edit distance per sample and, as "SER", the summed edit
# distance as a percentage of the summed target length.
print ('[Epoch ' + str(epoch) + '] ' + str(1. * val_ed / val_count) + ' (' + str(100. * val_ed / val_len) + ' SER) from ' + str(val_count) + ' samples.')
print ('Saving the model...')
saver.save(sess,args.save_model,global_step=epoch)
print ('------------------------------')
def las_model_fn(features, labels, mode, config, params, binf2phone=None, run_name=None):
    """Estimator ``model_fn`` for a listener/speller model.

    Builds the listener (encoder) and up to two spellers (decoders): the
    regular one under the ``speller`` scope and, when
    ``params.decoder.binary_outputs`` is set, a second one under
    ``speller_binf``. Depending on ``mode`` it returns:

    * PREDICT: an ``EstimatorSpec`` with a ``predictions`` dict,
    * EVAL: an ``EstimatorSpec`` with loss, metric ops and evaluation hooks,
    * TRAIN: an ``EstimatorSpec`` (or ``TPUEstimatorSpec`` when
      ``params.tpu_name`` is set) with loss and train op.

    :param features: dict with ``'encoder_inputs'`` and
        ``'source_sequence_length'``
    :param labels: dict with ``'targets_inputs'``, ``'targets_outputs'`` and
        ``'target_sequence_length'``; not read in PREDICT mode
    :param mode: a ``tf.estimator.ModeKeys`` value
    :param config: run config; only ``config.model_dir`` is read (EVAL)
    :param params: hyper-parameters (``encoder``, ``decoder``, ``mapping``,
        ``ctc_weight``, ``learning_rate``, ``l2_reg_scale``, ``add_noise``,
        ``noise_std``, ``tpu_name``)
    :param binf2phone: optional value turned into the constant
        ``binf_embedding`` when binary outputs are enabled
    :param run_name: optional name used for the eval summary directory
    """
    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs, decoder_inputs_binf = None, None
    targets = None
    target_sequence_length = None
    targets_binf = None

    binf_embedding = None
    if binf2phone is not None and params.decoder.binary_outputs:
        binf_embedding = tf.constant(binf2phone, dtype=tf.float32, name='binf2phone')
    # The id mapping is applied inside the graph only when a binf embedding
    # is in use; otherwise params.mapping is handed to utils.edit_distance.
    mapping = None
    if params.mapping and binf_embedding is not None:
        mapping = tf.convert_to_tensor(params.mapping)

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        if mapping is not None:
            decoder_inputs = tf.nn.embedding_lookup(mapping, decoder_inputs)
            targets = tf.nn.embedding_lookup(mapping, targets)
        target_sequence_length = labels['target_sequence_length']
        if binf_embedding is not None:
            targets_binf = tf.nn.embedding_lookup(tf.transpose(binf_embedding), targets)
            decoder_inputs_binf = tf.nn.embedding_lookup(
                tf.transpose(binf_embedding), decoder_inputs)

    tf.logging.info('Building listener')

    with tf.variable_scope('listener'):
        (encoder_outputs, source_sequence_length), encoder_state = las.model.listener(
            encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')

    # Regular speller: built unless the model is binary-output only.
    decoder_outputs, final_context_state, final_sequence_length = None, None, None
    if not params.decoder.binary_outputs or params.decoder.multitask:
        with tf.variable_scope('speller'):
            decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
                encoder_outputs, encoder_state, decoder_inputs,
                source_sequence_length, target_sequence_length,
                mode, params.decoder)

    # Binary-output speller.
    decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = None, None, None
    if params.decoder.binary_outputs:
        with tf.variable_scope('speller_binf'):
            decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = las.model.speller(
                encoder_outputs, encoder_state,
                decoder_inputs_binf if not params.decoder.binf_projection else decoder_inputs,
                source_sequence_length, target_sequence_length,
                mode, params.decoder, not params.decoder.binf_projection,
                binf_embedding if not params.decoder.binf_sampling or params.decoder.beam_width > 0 else None)

    # NOTE(review): sample_ids_binf is initialised here but never assigned
    # or read again in this function.
    sample_ids_phones_binf, sample_ids_phones, sample_ids_binf, logits_binf, logits = None, None, None, None, None
    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            # Beam search: ids come from predicted_ids and logits is only a
            # placeholder op.
            logits = tf.no_op()
            if decoder_outputs is not None:
                sample_ids_phones = decoder_outputs.predicted_ids
            if decoder_outputs_binf is not None:
                sample_ids_phones_binf = decoder_outputs_binf.predicted_ids
        else:
            if decoder_outputs is not None:
                logits = decoder_outputs.rnn_output
                sample_ids_phones = tf.to_int32(tf.argmax(logits, -1))
            if decoder_outputs_binf is not None:
                logits_binf = decoder_outputs_binf.rnn_output
                if params.decoder.binary_outputs and params.decoder.binf_sampling:
                    logits_phones_binf = transform_binf_to_phones(
                        logits_binf, binf_embedding)
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_phones_binf, -1))
                else:
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_binf, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Stack the concatenated c and h encoder states into one tensor.
        emb_c = tf.concat([x.c for x in encoder_state], axis=1)
        emb_h = tf.concat([x.h for x in encoder_state], axis=1)
        emb = tf.stack([emb_c, emb_h], axis=1)
        predictions = {
            'embedding': emb,
            'encoder_out': encoder_outputs,
            'source_length': source_sequence_length
        }
        if sample_ids_phones is not None:
            predictions['sample_ids'] = sample_ids_phones
        if logits_binf is not None:
            predictions['logits_binf'] = logits_binf
        if sample_ids_phones_binf is not None:
            predictions['sample_ids_phones_binf'] = sample_ids_phones_binf

        if final_context_state is not None:
            predictions['alignment'] = get_alignment_history(
                final_context_state, params)
        if final_context_state_binf is not None:
            predictions['alignment_binf'] = get_alignment_history(
                final_context_state_binf, params)

        # Probabilities are only exported without beam search.
        if params.decoder.beam_width == 0:
            if params.decoder.binary_outputs and binf_embedding is None:
                predictions['probs'] = tf.nn.sigmoid(logits_binf)
            elif logits is not None:
                predictions['probs'] = tf.nn.softmax(logits)
            else:
                predictions['probs'] = tf.nn.softmax(logits_binf)

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    edit_distance, edit_distance_binf = None, None
    with tf.name_scope('metrics'):
        if sample_ids_phones is not None:
            edit_distance = utils.edit_distance(
                sample_ids_phones, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)
        if sample_ids_phones_binf is not None:
            edit_distance_binf = utils.edit_distance(
                sample_ids_phones_binf, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)

        # The regular speller's edit distance is preferred; the binf one is
        # the fallback.
        metrics = {
            'edit_distance': tf.metrics.mean(edit_distance if edit_distance is not None else edit_distance_binf),
        }

    # In TRAIN mode the streaming mean is significantly affected by early
    # high values. As a result, train values in the summaries would be high
    # and drop after a restart. To prevent this, the last-batch average is
    # used in TRAIN mode.
    if mode != tf.estimator.ModeKeys.TRAIN:
        tf.summary.scalar('edit_distance', metrics['edit_distance'][1])
    elif not params.tpu_name:
        tf.summary.scalar(
            'edit_distance',
            tf.reduce_mean(edit_distance if edit_distance is not None else edit_distance_binf))

    audio_loss_ipa, audio_loss_binf = None, None
    if logits is not None:
        with tf.name_scope('cross_entropy'):
            audio_loss_ipa = compute_loss(
                logits, targets, final_sequence_length, target_sequence_length, mode)

    if logits_binf is not None:
        with tf.name_scope('cross_entropy_binf'):
            if params.decoder.binf_projection:
                audio_loss_binf = compute_loss(
                    logits_binf, targets, final_sequence_length_binf, target_sequence_length, mode)
            else:
                # The sigmoid loss uses the binf targets in TRAIN mode and
                # the plain targets otherwise.
                if mode == tf.estimator.ModeKeys.TRAIN:
                    audio_loss_binf = compute_loss_sigmoid(
                        logits_binf, targets_binf, final_sequence_length_binf,
                        target_sequence_length, mode)
                else:
                    audio_loss_binf = compute_loss_sigmoid(
                        logits_binf, targets, final_sequence_length_binf,
                        target_sequence_length, mode)

    # Total loss is the sum of whichever speller losses exist.
    audio_loss = 0
    if audio_loss_ipa is not None:
        audio_loss += audio_loss_ipa
    if audio_loss_binf is not None:
        audio_loss += audio_loss_binf
        tf.summary.scalar('audio_loss_binf', audio_loss_binf)

    ctc_edit_distance = None
    if params.ctc_weight > 0:
        # Auxiliary CTC head on the encoder outputs; one extra output class
        # beyond the target vocabulary (presumably the CTC blank; confirm).
        ctc_logits = tf.layers.dense(encoder_outputs, params.decoder.target_vocab_size + 1,
                                     activation=None, name='ctc_logits')
        # The logits are transposed with [1, 0, 2] before greedy decoding.
        decoded_ctc, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(ctc_logits, [1, 0, 2]), source_sequence_length)
        decoded_ctc = tf.sparse.to_dense(decoded_ctc[0])
        decoded_ctc = tf.cast(decoded_ctc, tf.int32)
        if target_sequence_length is not None:
            ctc_loss = tf.nn.ctc_loss_v2(labels=targets, logits=ctc_logits, logits_time_major=False,
                                         label_length=target_sequence_length,
                                         logit_length=source_sequence_length)
            ctc_loss = tf.reduce_mean(ctc_loss, name='ctc_phone_loss')
            audio_loss += ctc_loss * params.ctc_weight
            tf.summary.scalar('ctc_loss', ctc_loss)
        with tf.name_scope('ctc_metrics'):
            ctc_edit_distance = utils.edit_distance(
                decoded_ctc, targets, utils.EOS_ID,
                params.mapping if mapping is None else None)
            metrics['ctc_edit_distance'] = tf.metrics.mean(
                ctc_edit_distance)
        # Same streaming-mean vs. last-batch split as for 'edit_distance'.
        if mode != tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar('ctc_edit_distance', metrics['ctc_edit_distance'][1])
        else:
            tf.summary.scalar('ctc_edit_distance', tf.reduce_mean(ctc_edit_distance))

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state or final_context_state_binf)

        run_name = run_name or 'eval'
        if run_name != 'eval':
            # For other summaries eval is automatically added.
            run_name = 'eval_{}'.format(run_name)
        attention_summary = tf.summary.image('attention_images', attention_images)
        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=20,
            output_dir=os.path.join(config.model_dir, run_name),
            summary_op=attention_summary)
        hooks = [eval_summary_hook]
        loss = audio_loss
        log_data = {
            'edit_distance': tf.reduce_mean(edit_distance if edit_distance is not None else edit_distance_binf),
            'max_edit_distance': tf.reduce_max(edit_distance if edit_distance is not None else edit_distance_binf),
            'min_edit_distance': tf.reduce_min(edit_distance if edit_distance is not None else edit_distance_binf)
        }
        logging_hook = tf.train.LoggingTensorHook(log_data, every_n_iter=20)
        hooks += [logging_hook]

        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics,
                                          evaluation_hooks=hooks)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        # The literal TPU name 'fake' skips the cross-shard wrapper.
        if params.tpu_name and params.tpu_name != 'fake':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        total_params = np.sum([np.prod(x.shape.as_list()) for x in var_list])
        tf.logging.info('Trainable parameters: {}'.format(total_params))

        # L2 regularisation over all trainable variables.
        regularizer = tf_contrib.layers.l2_regularizer(params.l2_reg_scale)
        reg_term = tf.contrib.layers.apply_regularization(
            regularizer, var_list)
        audio_loss = audio_loss + reg_term

        # Each gradient is clipped by norm to GRAD_NORM before it is applied.
        # NOTE(review): compute_gradients may yield None for variables that
        # do not affect the loss; confirm clip_by_norm never sees one.
        gvs = optimizer.compute_gradients(audio_loss, var_list=var_list)
        capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var) for grad, var in gvs]
        train_op = optimizer.apply_gradients(
            capped_gvs, global_step=tf.train.get_global_step())

        if params.add_noise > 0:
            def add_noise():
                # Adds normal noise (mean NOISE_MEAN, std params.noise_std)
                # to every variable whose name ends in 'kernel:0', grouped
                # with the regular train op.
                noise_ops = [train_op]
                for var in var_list:
                    if var.name.endswith('kernel:0'):
                        shape = tf.shape(var)
                        noise_op = tf.assign_add(
                            var, tf.random_normal(shape, NOISE_MEAN, params.noise_std,
                                                  dtype=tf.float32))
                        noise_ops.append(noise_op)
                print_op = tf.print('Adding noise to weights')
                return tf.group(*noise_ops, print_op)

            # Noise is added when global_step is a positive multiple of
            # params.add_noise; otherwise the plain train op runs.
            train_op = tf.cond(
                tf.logical_and(
                    tf.equal(
                        tf.mod(tf.train.get_global_step(), params.add_noise), 0),
                    tf.greater(tf.train.get_global_step(), 0)),
                add_noise, lambda: train_op)

    loss = audio_loss
    train_log_data = {
        'loss': loss,
        'edit_distance': tf.reduce_mean(
            edit_distance if edit_distance is not None else edit_distance_binf)
    }
    if ctc_edit_distance is not None:
        train_log_data['ctc_edit_distance'] = tf.reduce_mean(ctc_edit_distance)
    logging_hook = tf.train.LoggingTensorHook(train_log_data, every_n_iter=10)

    # The TPU spec is returned without the logging hook.
    if not params.tpu_name:
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                          training_hooks=[logging_hook])
    else:
        return tf.estimator.tpu.TPUEstimatorSpec(mode, loss=loss, train_op=train_op)
return process_name, process_status, process_parent_name, process_parent_pid except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): pass if __name__ == "__main__": #print find_process_parent_info('firefox') #print find_process_by_name('firefox') #print check_process_status('firefox') # get the process names critical_process_list = list() for proc in psutil.process_iter(): p_name = proc.name() critical_process_list.append(p_name) # loop through the list and print #for entry in critical_process_list: # print entry # find out list of critical linux process ( See CIS benchmarks, NIST guides) for entry in critical_process_list: for proc in psutil.process_iter(): p_name = proc.name() process_distance = utils.edit_distance(p_name.lower(), entry) if process_distance < 5: print entry, p_name, process_distance
sample_img.shape[1], 1) # LENGTH length = [batch_image.shape[2] / WIDTH_REDUCTION] # PREDICTION prediction = sess.run(decoded, { inputs: batch_image, seq_len: length, rnn_keep_prob: 1.0 }) str_predictions = utils.sparse_tensor_to_strs(prediction) # EVALUATION ed = utils.edit_distance(str_predictions[0], label) if ed != 0: val_err = val_err + 1 val_ed = val_ed + ed val_len = val_len + len(label) val_count = val_count + 1 # Counter val_idx = val_idx + 1 print('Samples: ' + str(val_count)) print('Acc Err: ' + str(val_err) + ' (Avg. Err: ' + str(1. * val_err / val_count) + ')') print('Acc Ed: ' + str(val_ed) + ' (Avg. Ed: ' + str(1. * val_ed / val_count) + ')') print('SER: ' + str(100. * val_ed / val_len))
def las_model_fn(features, labels, mode, config, params):
    """Estimator ``model_fn`` for a listener/speller model.

    Builds the listener (encoder) and the speller (decoder) and returns an
    ``EstimatorSpec`` for the given ``mode``: predictions only in PREDICT,
    loss plus metrics and evaluation hooks in EVAL, loss plus train op and
    a logging hook in TRAIN.

    :param features: dict with ``'encoder_inputs'`` and
        ``'source_sequence_length'``
    :param labels: dict with ``'targets_inputs'``, ``'targets_outputs'`` and
        ``'target_sequence_length'``; not read in PREDICT mode
    :param mode: a ``tf.estimator.ModeKeys`` value
    :param config: run config; only ``config.model_dir`` is read (EVAL)
    :param params: hyper-parameters (``encoder``, ``decoder``, ``mapping``,
        ``learning_rate``)
    """
    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs = None
    targets = None
    target_sequence_length = None

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        target_sequence_length = labels['target_sequence_length']

    tf.logging.info('Building listener')

    with tf.variable_scope('listener'):
        (encoder_outputs, source_sequence_length), encoder_state = las.model.listener(
            encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')

    with tf.variable_scope('speller'):
        decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
            encoder_outputs, encoder_state, decoder_inputs,
            source_sequence_length, target_sequence_length,
            mode, params.decoder)

    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            # Beam search: ids come from predicted_ids and logits is only a
            # placeholder op.
            logits = tf.no_op()
            sample_ids = decoder_outputs.predicted_ids
        else:
            logits = decoder_outputs.rnn_output
            sample_ids = tf.to_int32(tf.argmax(logits, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'sample_ids': sample_ids,
        }

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    with tf.name_scope('metrics'):
        edit_distance = utils.edit_distance(
            sample_ids, targets, utils.EOS_ID, params.mapping)

        metrics = {
            'edit_distance': tf.metrics.mean(edit_distance),
        }

    # Element [1] of the tf.metrics.mean pair is what gets summarised.
    tf.summary.scalar('edit_distance', metrics['edit_distance'][1])

    with tf.name_scope('cross_entropy'):
        loss = compute_loss(
            logits, targets, final_sequence_length, target_sequence_length, mode)

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state)

        attention_summary = tf.summary.image(
            'attention_images', attention_images)

        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=10,
            output_dir=os.path.join(config.model_dir, 'eval'),
            summary_op=attention_summary)

        # Logs the mean, worst and best edit distance together with the
        # predictions and targets of the worst and best sample.
        logging_hook = tf.train.LoggingTensorHook({
            'edit_distance': tf.reduce_mean(edit_distance),
            'max_edit_distance': tf.reduce_max(edit_distance),
            'max_predictions': sample_ids[tf.argmax(edit_distance)],
            'max_targets': targets[tf.argmax(edit_distance)],
            'min_edit_distance': tf.reduce_min(edit_distance),
            'min_predictions': sample_ids[tf.argmin(edit_distance)],
            'min_targets': targets[tf.argmin(edit_distance)],
        }, every_n_iter=10)

        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics,
            evaluation_hooks=[logging_hook, eval_summary_hook])

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        train_op = optimizer.minimize(
            loss, global_step=tf.train.get_global_step())

    # Training progress is logged every 10 seconds (every_n_secs), unlike
    # the EVAL hook above, which logs every 10 iterations.
    logging_hook = tf.train.LoggingTensorHook({
        'loss': loss,
        'edit_distance': tf.reduce_mean(edit_distance),
        #'max_edit_distance': tf.reduce_max(edit_distance),
        #'predictions': sample_ids[tf.argmax(edit_distance)],
        #'targets': targets[tf.argmax(edit_distance)],
    }, every_n_secs=10)

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                      training_hooks=[logging_hook])
def _nearest_allele(query, alleles):
    ''' Finds the allele in alleles closest to query (query itself is skipped).
    Returns (distance, neighbor); neighbor is '' if nothing is closer than
    len(query). '''
    min_dist = len(query)
    neighbor = ''
    for allele in alleles:  # start with hamming distance as quick estimate
        if allele != query:
            dist = hamming_distance(allele, query)
            if dist < min_dist:
                min_dist = dist
                neighbor = allele
    if min_dist > 2:  # hamming distance is large, compute edit distance in full
        for allele in alleles:
            if allele != query:
                dist = edit_distance(allele, query)
                if dist < min_dist:
                    min_dist = dist
                    neighbor = allele
    return int(min_dist), neighbor


def allele_analysis_differential_carbon_source(query_dir, expt_source,
                                               control_source='glc__D',
                                               ref_dir='reference/',
                                               rare_limit=-1,
                                               low_mem=True,
                                               write_log=False):
    '''
    Runs the following analysis for a given model
    1) Simulate on an experimental carbon source and a control carbon source
    2) Identify reactions active only in the experimental conditions
    3) Identify genes associated with the differential reactions
    4) Compare upstream and coding sequences of those genes to reference strains

    Expects there to be four files in query-dir: <strain>.faa, <strain>.json,
    <strain>_cdhit_merged.faa.clstr, and <strain>_upstream.fna.
    If expt_source=None, does not do a differential analysis but instead
    analyzes all genes as in steps 3/4.

    Nothing is returned: the report is printed to the console and, when
    write_log is set, also written to <strain>_allele_report.txt in query_dir.
    If no .faa file or any of the four expected files is missing, a message
    is printed and the function returns early.

    Parameters
    ----------
    query_dir : str
        Directory with model, upstream, and CD-Hit cluster files.
    expt_source : str
        Metabolite ID of carbon source to compare against control carbon source.
        Alternatively, if a list/set/tuple of reactionIDs is provided, uses
        those instead of attempting to identify differentially active
        reactions. Alternatively again, if None, will examine all
        reactions/genes.
    control_source : str
        Metabolite ID of carbon source to use as baseline (default glc__D)
    ref_dir : str
        Directory with reference materials, refer to recon.ref (default reference/)
    rare_limit : int
        If positive, only reports model alleles that have been observed at most
        rare_limit times in the cluster file, i.e. if rare_limit = 1, only reports
        alleles that have never been observed among reference strains. Reports
        all if negative (default -1).
    low_mem : bool
        If True, only stores relevant sequences from reference genomes in memory
        by filtering by header; runs notably slower (default True)
    write_log : bool
        If True, saves/overwrites the allele report file in query_dir (default False)
    '''
    # Only this many leading characters of each upstream sequence are compared.
    ups_len = 53

    # ---- Load files relevant to the query ----
    strain = None
    for filename in os.listdir(query_dir):
        if filename[-4:] == '.faa':
            strain = filename[:-4]
    if strain is None:  # previously crashed with a NameError further down
        print('NO .faa FILE FOUND, ABORTING: %s' % query_dir)
        return

    log_file = os.path.join(query_dir, strain + '_allele_report.txt')
    protein_file = os.path.join(query_dir, strain + '.faa')
    model_file = os.path.join(query_dir, strain + '.json')
    upstream_file = os.path.join(query_dir, strain + '_upstream.fna')
    cluster_file = os.path.join(query_dir, strain + '_cdhit_merged.faa.clstr')
    for required_file in (protein_file, model_file, upstream_file, cluster_file):
        if not os.path.exists(required_file):
            print('FILE IS MISSING, ABORTING: %s' % required_file)
            return

    # ---- Prepare output file ----
    log_f = open(log_file, 'w+') if write_log else None

    def log_to_file(*argv):
        line = ' '.join(map(str, argv))
        print(line)  # print to console first
        if log_f is not None:
            log_f.write(line + '\n')  # write to file

    def query_upstream_label(header):
        # "<local>|<up>|<cluster gene>" -> "<local>|<cluster gene>"
        parts = header.split('|')
        return parts[0] + '|' + parts[-1]

    try:
        # ---- Extract all co-clustered reference genes ----
        def query_fxn(feature_name):
            return feature_name[:5] != '>lcl|'

        query_to_cluster = get_co_clustered(cluster_file, query_fxn)
        log_to_file('Loaded query clusters:', len(query_to_cluster))

        # ---- Simulate differential growth ----
        model = cobra.io.load_json_model(model_file)
        if isinstance(expt_source, str):
            # analyzing differential genes WRT carbon source
            expt_diff_genes, expt_diff_rxns = get_differential_reactions(
                model, expt_source, control_source)
            log_to_file('Found differentially active reactions:',
                        len(expt_diff_rxns))
            log_to_file(sorted(expt_diff_rxns))
            log_to_file('Found differentially active genes:',
                        len(expt_diff_genes))
            log_to_file(sorted(expt_diff_genes.keys()))
        elif isinstance(expt_source, (list, set, tuple)):
            # pre-determined set of reactions
            expt_diff_genes = get_gene_mapping_for_reactions(model, expt_source)
            log_to_file('Found differentially active genes:',
                        len(expt_diff_genes))
            log_to_file(sorted(expt_diff_genes.keys()))
        else:
            # analyzing all genes
            expt_diff_genes = get_gene_mapping_for_reactions(
                model, [rxn.id for rxn in model.reactions])
            log_to_file('Analyzing all genes:', len(expt_diff_genes))

        # ---- Extract locus tags for relevant reference genes ----
        header_to_label = {}  # map raw headers (without ">") to labels
        with open(os.path.join(ref_dir, 'ref_labels.tsv'), 'r') as f:
            for line in f:
                name, label = line.split()
                header_to_label[name] = label

        # query_to_cluster reduced to just differentially active genes
        matched_diff_queries = {}
        # set of all reference genes co-clustered with a differential gene
        diff_clustered_ref_names = set()
        for match_gene in expt_diff_genes:
            query_gene = expt_diff_genes[match_gene][0]
            if query_gene is None:
                # The report loop below already tolerates unmapped genes;
                # without this guard '>' + None raised before reaching it.
                continue
            cluster = query_to_cluster['>' + query_gene]
            matched_diff_queries[query_gene] = cluster
            diff_clustered_ref_names.update(cluster)
        del query_to_cluster
        diff_clustered_ref_loci = [
            header_to_label[ref_name[1:]].split('|')[1]
            for ref_name in diff_clustered_ref_names
        ]
        log_to_file('Found co-clustered reference genes:',
                    len(diff_clustered_ref_loci))
        # Set copy so the per-header filter below is O(1) per lookup.
        ref_loci_lookup = set(diff_clustered_ref_loci)

        # ---- Extract relevant sequences from reference and query files ----
        def filter_ref_seq(header):  # identifying reference protein sequences
            if not header.strip():  # catch empty lines
                return False
            return header.split()[0] in diff_clustered_ref_names

        def filter_ref_upstream(header):  # identifying reference upstream sequences
            if not header.strip():  # catch empty lines
                return False
            return header.split('|')[1] in ref_loci_lookup

        def filter_query_seq(header):  # identifying query protein sequences
            return header[1:] in matched_diff_queries

        def filter_query_upstream(header):  # identifying query upstream sequences
            return query_upstream_label(header[1:]) in matched_diff_queries

        ref_seqs = {}  # maps ref headers ">lcl|..." to protein sequences
        ref_upstreams = {}  # maps ref headers "<strain>|<tag>|..." to upstream sequences

        ref_seq_dir = os.path.join(ref_dir, 'ref_genomes')
        for ref_seq_file in os.listdir(ref_seq_dir):
            if ref_seq_file[-4:] == '.faa':  # expect amino acid fasta
                ref_seq_path = os.path.join(ref_seq_dir, ref_seq_file)
                if low_mem:  # Load only relevant sequences
                    ref_seqs.update(get_sequences_as_dict(
                        ref_seq_path, select_fxn=filter_ref_seq))
                else:  # Load all sequences, faster since no filtering
                    ref_seqs.update(get_sequences_as_dict(ref_seq_path))
        ref_seqs = {k.split()[0]: v for k, v in ref_seqs.items()}
        log_to_file('Loaded reference sequences for genes:', len(ref_seqs))

        ref_upstream_dir = os.path.join(ref_dir, 'ref_upstream')
        for ref_upstream_file in os.listdir(ref_upstream_dir):
            if ref_upstream_file[-4:] == '.fna':  # expect nucleotide fasta
                ref_upstream_path = os.path.join(ref_upstream_dir,
                                                 ref_upstream_file)
                if low_mem:  # Load only relevant sequences
                    ref_upstreams.update(get_sequences_as_dict(
                        ref_upstream_path, select_fxn=filter_ref_upstream))
                else:  # Load all sequences, faster since no filtering
                    ref_upstreams.update(
                        get_sequences_as_dict(ref_upstream_path))
        ref_upstreams = {k.split('|')[1]: v for k, v in ref_upstreams.items()}
        log_to_file('Loaded reference upstream sequences for genes:',
                    len(ref_upstreams))

        # ---- Extract relevant query sequences ----
        # maps query headers "<local gene>|<cluster gene>" to protein sequences
        query_seqs = get_sequences_as_dict(protein_file,
                                           select_fxn=filter_query_seq)
        query_seqs = {k[1:]: v for k, v in query_seqs.items()}
        log_to_file('Loaded query sequences for genes:', len(query_seqs))

        # maps query headers "<local>|<cluster gene>" to upstream sequences
        query_upstreams = get_sequences_as_dict(
            upstream_file, select_fxn=filter_query_upstream)
        query_upstreams = {
            query_upstream_label(k[1:]): v
            for k, v in query_upstreams.items()
        }
        log_to_file('Loaded query upstream sequences for genes:',
                    len(query_upstreams))

        # ---- Report allele analysis of differential genes ----
        extreme_cases = 0
        for match_gene in sorted(expt_diff_genes.keys()):
            query_gene = expt_diff_genes[match_gene][0]
            impacted_reactions = expt_diff_genes[match_gene][1]
            if query_gene is None:
                continue
            query_seq = query_seqs[query_gene]
            query_ups = query_upstreams[query_gene][:ups_len]

            # Get allele distribution of reference gene/upstream sequences
            seq_distr = {}
            ups_distr = {}
            for ref_gene in matched_diff_queries[query_gene]:
                ref_tag = header_to_label[ref_gene[1:]].split('|')[1]
                # exclude rare cases where either piece of information is missing
                if ref_tag in ref_upstreams and ref_gene in ref_seqs:
                    ref_seq = ref_seqs[ref_gene]
                    ref_ups = ref_upstreams[ref_tag][:ups_len]
                    seq_distr[ref_seq] = seq_distr.get(ref_seq, 0) + 1
                    ups_distr[ref_ups] = ups_distr.get(ref_ups, 0) + 1

            # Add in query sequence
            seq_distr[query_seq] = seq_distr.get(query_seq, 0) + 1
            ups_distr[query_ups] = ups_distr.get(query_ups, 0) + 1

            # Report allele distribution
            query_seq_count = seq_distr[query_seq]
            query_ups_count = ups_distr[query_ups]
            if rare_limit < 0 or (query_seq_count <= rare_limit
                                  and query_ups_count <= rare_limit):
                extreme_cases += 1
                log_to_file('\n-------------', extreme_cases, 'GENE:',
                            match_gene, '<->', query_gene, '-------------')
                log_to_file('\nImpacted Reactions:',
                            ', '.join(impacted_reactions), '\n')

                # If a sequence is unique, compute the distance to the
                # nearest sequence in its cluster
                seq_neighbor = ''
                ups_neighbor = ''
                if query_seq_count == 1:  # query sequence is new
                    min_seq_dist, seq_neighbor = _nearest_allele(
                        query_seq, seq_distr)
                    log_to_file(
                        'Unique sequence, distance to nearest neighbor:',
                        min_seq_dist)
                if query_ups_count == 1:  # query upstream sequence is new
                    min_ups_dist, ups_neighbor = _nearest_allele(
                        query_ups, ups_distr)
                    log_to_file(
                        'Unique upstream, distance to nearest neighbor:',
                        min_ups_dist)

                # Report previews of co-clustered sequences/upstreams;
                # '*' marks the query allele, '^' its nearest neighbor
                log_to_file('\nCoding sequence distribution:')
                log_to_file('Count\tLength\tSeq')
                for seq_allele in sorted(seq_distr):
                    count = str(seq_distr[seq_allele])
                    if seq_allele == query_seq:
                        count += '*'
                    elif seq_allele == seq_neighbor:
                        count += '^'
                    log_to_file(count, '\t', len(seq_allele), '\t',
                                seq_allele[:50] + '...')

                log_to_file('\nUpstream sequence distribution:')
                log_to_file('Count\tSeq')
                for ups_allele in sorted(ups_distr):
                    count = str(ups_distr[ups_allele])
                    if ups_allele == query_ups:
                        count += '*'
                    elif ups_allele == ups_neighbor:
                        count += '^'
                    log_to_file(count, '\t', ups_allele)
    finally:
        # The report file used to be left open; always release it.
        if log_f is not None:
            log_f.close()
def test_folder(self, test_folder): for wav_file in sorted(os.listdir(test_folder)): # Read input test file wav_path = os.path.join(test_folder, wav_file) dump_path = wav_path[:-4] + '_pred.txt' # Read only wav if wav_file == '.DS_Store' or wav_file.split( '.')[-1] != 'wav': # or os.path.exists(dump_path): continue feat = utils.read_wav(wav_path, winlen=self.config['window_size'], winstep=self.config['window_step'], fbank_filt=self.config['n_fbank'], mfcc_filt=self.config['n_mfcc']) tsteps, hidden_dim = feat.shape # calculate log mel filterbank energies for complete file feat_log_full = np.reshape(feat, (1, tsteps, hidden_dim)) lens = np.array([tsteps]) # prepare tensors inputs, lens = torch.from_numpy( np.array(feat_log_full)).float(), torch.from_numpy( np.array(lens)).long() id_to_phone = {v[0]: k for k, v in self.model.phone_to_id.items()} self.model.eval() with torch.no_grad(): if self.cuda: inputs = inputs.cuda() lens = lens.cuda() # Pass through model outputs = self.model(inputs, lens).cpu().numpy() # Since only one example per batch and ignore blank token outputs = outputs[0] # softmax = np.exp(outputs) / np.sum(np.exp(outputs), axis=1)[:, None] # Take argmax to generate final string argmaxed = np.argmax(outputs, axis=1) # collapse according to CTC rules final_str = utils.collapse_frames(argmaxed, self.model.blank_token_id) ans = [id_to_phone[a] for a in final_str] # Generate dumpable format of phone, start time and end time print("Predicted:", ans) phone_path = wav_path[:-3] + 'PHN' # If .PHN file exists, report edit distance if os.path.exists(phone_path): truth = utils.read_PHN_file(phone_path) edit_dist, ops = utils.edit_distance(truth, ans) print("Ground Truth:", truth, '\nEdit dsitance:', edit_dist) with open(dump_path, 'w') as f: f.write('Predicted:\n') f.write(' '.join(ans)) f.write('\nGround Truth:\n') f.write(' '.join(truth)) f.write('\nEdit distance: ' + str(edit_dist)) else: with open(dump_path, 'w') as f: f.write('Predicted:\n') f.write(' 
'.join(ans))
f" [INFO] {len(predicted)} predictions decoded in {round(time.time() - start, 2)} sec. " ) if result_path is not None: if len(fnames) != len(predicted_text): fnames = [ fname for fname in bboxs for j in range(len(bboxs[fname])) ] out = pd.DataFrame({"fname": fnames, "prediction": predicted_text}) out_name = os.path.join(result_path, "prediction.csv") out.to_csv(out_name) print(" [INFO] Prediction example: \n", predicted_text[:10]) print(" [INFO] Result store in: ", out_name) if validate: print(" [INFO] Computing edit distance metric... ") start = time.time() true_text = [ decoder.labels_to_text(y_true[i]) for i in range(len(y_true)) ] print(" [INFO] Example pairs (predicted, true): \n", list(zip(predicted_text[:10], true_text[:10]))) edit_distance_score = edit_distance(predicted_text, true_text) normalized_edit_distance_score = normalized_edit_distance( predicted_text, true_text) print( f" [INFO] edit distances calculated in {round(time.time() - start, 2)} sec. " ) print( " [INFO] mean edit distance: %f ; normalized edit distance score: %f" % (edit_distance_score, normalized_edit_distance_score))
def las_model_fn(features, labels, mode, config, params, binf2phone=None, run_name=None):
    """Estimator model function for the listener/speller model with optional
    binary-feature ("binf") outputs and an optional text reader/writer branch.

    Reads ``features['encoder_inputs']`` and
    ``features['source_sequence_length']``; outside PREDICT mode also reads
    ``labels['targets_inputs']``, ``labels['targets_outputs']`` and
    ``labels['target_sequence_length']``.

    Args:
        features, labels, mode, config: standard estimator arguments;
            ``config.model_dir`` is the parent of the eval summary directory.
        params: hyper-parameters. Fields read here include ``use_text``,
            ``emb_loss``, ``text_loss``, ``mapping``, ``learning_rate``,
            ``l2_reg_scale``, ``add_noise``, ``noise_std``, ``encoder`` and
            ``decoder`` (``binary_outputs``, ``binf_sampling``,
            ``beam_width``, ``target_vocab_size``).
            Note that ``params.use_text`` is overwritten with False in
            PREDICT mode.
        binf2phone: optional value turned into a float32 constant when
            ``params.decoder.binary_outputs`` is set.
        run_name: optional evaluation run name; anything other than 'eval'
            is written under ``eval_<run_name>``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.

    Raises:
        ValueError: in training with ``params.use_text`` when both
            ``params.text_loss`` and ``params.emb_loss`` are truthy.
    """
    # NOTE(review): this block was reconstructed from a flattened source;
    # the nesting of statements relative to the scopes below should be
    # confirmed, since it determines op and summary names.
    if tf.estimator.ModeKeys.PREDICT == mode:
        # The text branch needs targets, which are not available in PREDICT.
        params.use_text = False

    encoder_inputs = features['encoder_inputs']
    source_sequence_length = features['source_sequence_length']

    decoder_inputs, decoder_inputs_binf = None, None
    targets = None
    target_sequence_length = None

    binf_embedding = None
    if binf2phone is not None and params.decoder.binary_outputs:
        binf_embedding = tf.constant(binf2phone, dtype=tf.float32, name='binf2phone')
    # Controls whether the sigmoid ("binf") loss is added further down.
    is_binf_outputs = params.decoder.binary_outputs and params.decoder.binf_sampling and (
        binf_embedding is None or mode == tf.estimator.ModeKeys.TRAIN)

    mapping = None
    if params.mapping and binf_embedding is not None:
        mapping = tf.convert_to_tensor(params.mapping)

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        if mapping is not None:
            # Remap label ids through params.mapping.
            decoder_inputs = tf.nn.embedding_lookup(mapping, decoder_inputs)
            targets = tf.nn.embedding_lookup(mapping, targets)
        target_sequence_length = labels['target_sequence_length']
        if binf_embedding is not None:
            # NOTE(review): targets_binf is only assigned here, yet
            # is_binf_outputs can be True when binf_embedding is None, in
            # which case the later compute_loss_sigmoid call would reference
            # an unassigned name — confirm that path is unreachable.
            targets_binf = tf.nn.embedding_lookup(tf.transpose(binf_embedding), targets)
            decoder_inputs_binf = tf.nn.embedding_lookup(
                tf.transpose(binf_embedding), decoder_inputs)

    text_loss = 0
    text_edit_distance = reader_encoder_state = None
    if params.use_text:
        # Text branch: a reader over the decoder inputs and a writer on top.
        tf.logging.info('Building reader')

        with tf.variable_scope('reader'):
            (reader_encoder_outputs, reader_source_sequence_length
             ), reader_encoder_state = text_ae.model.reader(
                 decoder_inputs, target_sequence_length, mode, params.encoder,
                 params.decoder.target_vocab_size)

        tf.logging.info('Building writer')

        with tf.variable_scope('writer'):
            writer_decoder_outputs, writer_final_context_state, writer_final_sequence_length = text_ae.model.speller(
                reader_encoder_outputs, reader_encoder_state, decoder_inputs,
                reader_source_sequence_length, target_sequence_length,
                mode, params.decoder)

        with tf.name_scope('text_prediciton'):
            logits = writer_decoder_outputs.rnn_output
            sample_ids = tf.to_int32(tf.argmax(logits, -1))

        with tf.name_scope('text_metrics'):
            # params.mapping is forwarded only when it was not already
            # applied to the targets above.
            text_edit_distance = utils.edit_distance(
                sample_ids, targets, utils.EOS_ID, params.mapping if mapping is None else None)

            metrics = {
                'text_edit_distance': tf.metrics.mean(text_edit_distance),
            }

        tf.summary.scalar('text_edit_distance', metrics['text_edit_distance'][1])

        with tf.name_scope('text_cross_entropy'):
            text_loss = compute_loss(
                logits, targets, writer_final_sequence_length, target_sequence_length, mode)

    tf.logging.info('Building listener')

    with tf.variable_scope('listener'):
        # The listener also returns a sequence length, which replaces the
        # input one for everything downstream.
        (encoder_outputs, source_sequence_length), encoder_state = las.model.listener(
            encoder_inputs, source_sequence_length, mode, params.encoder)

    tf.logging.info('Building speller')

    with tf.variable_scope('speller'):
        decoder_outputs, final_context_state, final_sequence_length = las.model.speller(
            encoder_outputs, encoder_state, decoder_inputs,
            source_sequence_length, target_sequence_length,
            mode, params.decoder)

    decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = None, None, None
    if params.decoder.binary_outputs:
        # Second speller over the binary-feature decoder inputs.
        with tf.variable_scope('speller_binf'):
            decoder_outputs_binf, final_context_state_binf, final_sequence_length_binf = las.model.speller(
                encoder_outputs, encoder_state, decoder_inputs_binf,
                source_sequence_length, target_sequence_length,
                mode, params.decoder, True,
                binf_embedding if not params.decoder.binf_sampling or params.decoder.beam_width > 0 else None)

    sample_ids_phones_binf, sample_ids_binf, logits_binf = None, None, None
    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params.decoder.beam_width > 0:
            # Beam-search prediction: ids come from predicted_ids and logits
            # is only a placeholder op.
            logits = tf.no_op()
            sample_ids_phones = decoder_outputs.predicted_ids
            if decoder_outputs_binf is not None:
                sample_ids_phones_binf = decoder_outputs_binf.predicted_ids
        else:
            logits = decoder_outputs.rnn_output
            sample_ids_phones = tf.to_int32(tf.argmax(logits, -1))
            if decoder_outputs_binf is not None:
                logits_binf = decoder_outputs_binf.rnn_output
                if params.decoder.binary_outputs and params.decoder.binf_sampling:
                    # Round the sigmoid to get binary ids, and separately
                    # map the binf logits to phone logits for phone ids.
                    sample_ids_binf = tf.to_int32(
                        tf.round(tf.sigmoid(logits_binf)))
                    logits_phones_binf = transform_binf_to_phones(
                        logits_binf, binf_embedding)
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_phones_binf, -1))
                else:
                    sample_ids_phones_binf = tf.to_int32(
                        tf.argmax(logits_binf, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Concatenate the .c and .h parts of every encoder state entry along
        # axis 1, then stack the two results.
        emb_c = tf.concat([x.c for x in encoder_state], axis=1)
        emb_h = tf.concat([x.h for x in encoder_state], axis=1)
        emb = tf.stack([emb_c, emb_h], axis=1)
        predictions = {
            'sample_ids': sample_ids_phones,
            'embedding': emb,
            'encoder_out': encoder_outputs,
            'source_length': source_sequence_length
        }
        if logits_binf is not None:
            predictions['logits_binf'] = logits_binf
        if sample_ids_phones_binf is not None:
            predictions['sample_ids_phones_binf'] = sample_ids_phones_binf
        predictions['alignment'] = get_alignment_history(
            final_context_state, params)
        if final_context_state_binf is not None:
            predictions['alignment_binf'] = get_alignment_history(
                final_context_state_binf, params)

        if params.decoder.beam_width == 0:
            # Probabilities are only exposed without beam search.
            if params.decoder.binary_outputs and binf_embedding is None:
                predictions['probs'] = tf.nn.sigmoid(logits_binf)
            else:
                predictions['probs'] = tf.nn.softmax(logits)

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    metrics = None

    with tf.name_scope('metrics'):
        edit_distance = utils.edit_distance(
            sample_ids_phones, targets, utils.EOS_ID, params.mapping if mapping is None else None)

        metrics = {
            'edit_distance': tf.metrics.mean(edit_distance),
        }

    # NOTE(review): placed outside the 'metrics' scope here — confirm, as it
    # changes the summary tag.
    if params.use_text and not params.emb_loss:
        pass
    else:
        # In TRAIN mode the running mean is significantly affected by early
        # high values, so train summaries would stay high and then drop after
        # a restart. To prevent this, the last-batch average is used in TRAIN.
        if mode != tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar('edit_distance', metrics['edit_distance'][1])
        else:
            tf.summary.scalar('edit_distance', tf.reduce_mean(edit_distance))

    with tf.name_scope('cross_entropy'):
        audio_loss = compute_loss(
            logits, targets, final_sequence_length, target_sequence_length, mode)

    if is_binf_outputs:
        with tf.name_scope('cross_entropy_binf'):
            audio_loss_binf = compute_loss_sigmoid(
                logits_binf, targets_binf, final_sequence_length, target_sequence_length, mode)
        audio_loss += audio_loss_binf

    emb_loss = 0
    if params.use_text:
        with tf.name_scope('embeddings_loss'):
            emb_loss = compute_emb_loss(encoder_state, reader_encoder_state)

    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('alignment'):
            attention_images = utils.create_attention_images(
                final_context_state)

        if params.use_text and not params.emb_loss:
            # Text-only setup: no attention summaries, report the text loss.
            hooks = []
            loss = text_loss
        else:
            run_name = run_name or 'eval'
            if run_name != 'eval':
                # For other summaries eval is automatically added.
                run_name = 'eval_{}'.format(run_name)
            attention_summary = tf.summary.image(
                'attention_images', attention_images)
            eval_summary_hook = tf.train.SummarySaverHook(
                save_steps=20,
                output_dir=os.path.join(config.model_dir, run_name),
                summary_op=attention_summary)
            hooks = [eval_summary_hook]
            loss = audio_loss
        log_data = {
            'edit_distance': tf.reduce_mean(edit_distance),
            'max_edit_distance': tf.reduce_max(edit_distance),
            'min_edit_distance': tf.reduce_min(edit_distance)
        }
        if params.use_text:
            if not params.emb_loss:
                # Audio statistics are dropped in the text-only setup.
                log_data = {}
            else:
                log_data['emb_loss'] = tf.reduce_mean(emb_loss)
            log_data['text_edit_distance'] = tf.reduce_mean(text_edit_distance)
        logging_hook = tf.train.LoggingTensorHook(log_data, every_n_iter=20)
        hooks += [logging_hook]

        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics,
                                          evaluation_hooks=hooks)

    # Remaining case: neither PREDICT nor EVAL, i.e. training.
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(params.learning_rate)
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if params.use_text:
            # Split trainable variables by name prefix: "audio" is
            # everything outside reader/writer, "text" is everything outside
            # listener/speller.
            audio_var_list = [
                x for x in var_list
                if not x.name.startswith('reader') and not x.name.startswith('writer')
            ]
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in audio_var_list])
            tf.logging.info(
                'Trainable audio parameters: {}'.format(total_params))
            text_var_list = [
                x for x in var_list
                if not x.name.startswith('listener') and not x.name.startswith('speller')
            ]
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in text_var_list])
            tf.logging.info(
                'Trainable text parameters: {}'.format(total_params))
            gvs = optimizer.compute_gradients(audio_loss, var_list=audio_var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var) for grad, var in gvs]
            audio_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            gvs = optimizer.compute_gradients(text_loss, var_list=text_var_list)
            # No attention means that top layers won't affect anything.
            # Thus gradients for them would be None.
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var) for grad, var in gvs if grad is not None]
            text_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            gvs = optimizer.compute_gradients(emb_loss, var_list=audio_var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var) for grad, var in gvs if grad is not None]
            emb_train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
            if not params.text_loss:
                tf.logging.info(
                    'Removing reader and writer from optimization.')
                train_op = tf.group(audio_train_op, emb_train_op)
            elif not params.emb_loss:
                tf.logging.info(
                    'Removing listener and speller from optimization params.')
                train_op = text_train_op
            else:
                raise ValueError(
                    'Either text_loss or emb_loss must be set with use_text!')
        else:
            total_params = np.sum(
                [np.prod(x.shape.as_list()) for x in var_list])
            tf.logging.info('Trainable parameters: {}'.format(total_params))
            # NOTE(review): both `tf_contrib` and `tf.contrib` are used here;
            # presumably they refer to the same module — confirm.
            regularizer = tf_contrib.layers.l2_regularizer(params.l2_reg_scale)
            reg_term = tf.contrib.layers.apply_regularization(
                regularizer, var_list)
            audio_loss = audio_loss + reg_term
            gvs = optimizer.compute_gradients(audio_loss, var_list=var_list)
            capped_gvs = [(tf.clip_by_norm(grad, GRAD_NORM), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(
                capped_gvs, global_step=tf.train.get_global_step())
        if params.add_noise > 0:
            def add_noise():
                # Groups the train op with additive normal noise on every
                # variable whose name ends with 'kernel:0'.
                noise_ops = [train_op]
                for var in var_list:
                    if var.name.endswith('kernel:0'):
                        shape = tf.shape(var)
                        noise_op = tf.assign_add(
                            var,
                            tf.random_normal(shape, NOISE_MEAN, params.noise_std,
                                             dtype=tf.float32))
                        noise_ops.append(noise_op)
                print_op = tf.print('Adding noise to weights')
                return tf.group(*noise_ops, print_op)
            # Noise is applied when global_step is a positive multiple of
            # params.add_noise; otherwise the plain train op runs.
            train_op = tf.cond(
                tf.logical_and(
                    tf.equal(
                        tf.mod(tf.train.get_global_step(), params.add_noise), 0),
                    tf.greater(tf.train.get_global_step(), 0)),
                add_noise, lambda: train_op)

    loss = text_loss if params.use_text and not params.emb_loss else audio_loss
    train_log_data = {'loss': loss}
    if params.use_text:
        if params.emb_loss:
            train_log_data['edit_distance'] = tf.reduce_mean(edit_distance)
            train_log_data['emb_loss'] = tf.reduce_mean(emb_loss)
        train_log_data['text_edit_distance'] = tf.reduce_mean(
            text_edit_distance)
    else:
        train_log_data['edit_distance'] = tf.reduce_mean(edit_distance)
    logging_hook = tf.train.LoggingTensorHook(train_log_data, every_n_iter=10)

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                      training_hooks=[logging_hook])
def test(self, epoch=None):
    """Evaluate the model on ``self.test_loader`` and record the results.

    Batches are pulled from the loader until it reports ``status == 1``.
    Each sequence is decoded (arg-max plus frame collapsing when
    ``config['decode_type'] == 'max'``, otherwise via ``decode`` on the
    softmaxed outputs) and compared with the ground truth by edit distance.

    Side effects:
        * appends ``(edit_distance, epoch)`` to ``self.edit_dist`` and
          ``(loss, epoch)`` to ``self.test_losses``;
        * in ``train`` mode, saves the model and dumps ``probs.pkl`` when the
          test loss is the lowest seen so far, and always dumps
          ``<epoch>_probs.pkl``;
        * switches the model to eval mode on entry and back to train mode on
          exit.

    Args:
        epoch: epoch number stored alongside the results and used in the
            per-epoch dump file name.

    Returns:
        Total edit distance divided by the total number of ground-truth
        phones.
    """
    self.model.eval()
    # accumulated edit distance and number of ground-truth phones
    edit_dist_batch = 0
    total_phones = 0
    decode_type = self.config['decode_type']
    # per-phone operation counts used to estimate probabilities
    num_ph = self.model.num_phones
    op_dict = {
        ph: {
            'matches': 0,
            'insertions': 0,
            'deletions': 0,
            'substitutions': np.zeros(num_ph),
            'total': 0
        }
        for ph in range(num_ph)
    }

    print("Testing...")
    print('Total batches:', len(self.test_loader))
    test_loss = 0
    num_sequences = 0
    mask_len = self.model.num_layers * (1 + self.config['bidirectional'])

    with torch.no_grad():
        # The custom model gets a reset flag per layer/direction, True only
        # for the first batch.
        dropout_mask_reset = [True] * mask_len if self.using_custom else None

        while True:
            # retrieve batch from dataloader
            inputs, labels, input_lens, label_lens, status = self.test_loader.return_batch(
                self.cuda)
            # kept from the original flow: clear the parameter gradients
            self.model.optimizer.zero_grad()

            # forward
            if self.using_custom:
                outputs = self.model(inputs, input_lens, dropout_mask_reset)
                dropout_mask_reset = [False] * mask_len
            else:
                outputs = self.model(inputs, input_lens)

            # calculate loss
            loss = self.model.calculate_loss(outputs, labels, input_lens,
                                             label_lens)
            print(loss)
            test_loss += loss.item()

            outputs = outputs.cpu().numpy()
            labels = labels.cpu().numpy()
            num_sequences += outputs.shape[0]

            # Batch-level decode inputs are computed once. Doing the softmax
            # inside the per-sequence loop re-applied it to the whole batch
            # for every sequence, so later sequences were decoded from
            # repeatedly softmaxed scores.
            if decode_type == 'max':
                # argmax over the phone channel
                argmaxed = np.argmax(outputs, axis=2)
            else:
                outputs = utils.softmax(outputs)

            # edit distance between ground truth and predicted sequence
            for i in range(outputs.shape[0]):
                if decode_type == 'max':
                    seq = list(argmaxed[i][:input_lens[i]])
                    # collapse neighbouring and remove blank token
                    output_seq = utils.collapse_frames(
                        seq, self.model.blank_token_id)
                else:
                    # predict by CTC
                    output_seq = decode(outputs[i, :input_lens[i], :], 1,
                                        self.model.blank_token_id)[0][0]

                gr_truth = list(labels[i][:label_lens[i]])
                # edit distance and the operations that produced it
                dist, opr = utils.edit_distance(gr_truth, output_seq)
                total_phones += len(gr_truth)

                # update number of operations
                for op_type, ids in opr.items():
                    if op_type == 'substitutions':
                        for orig, replace in ids:
                            op_dict[orig]['substitutions'][replace] += 1
                            op_dict[orig]['total'] += 1
                    else:
                        for idx in ids:
                            op_dict[idx][op_type] += 1
                            op_dict[idx]['total'] += 1

                edit_dist_batch += dist

            if status == 1:
                break

            print("Done with:", num_sequences, '/', self.test_loader.num_egs)

    # Average out the losses and edit distance
    test_loss /= len(self.test_loader)
    edit_dist_batch /= total_phones

    print("Edit distance - %.4f %% , Loss: %.7f" %
          (edit_dist_batch * 100, test_loss))

    # Store in lists for keeping track of model performance
    self.edit_dist.append((edit_dist_batch, epoch))
    self.test_losses.append((test_loss, epoch))

    # Probabilities of insertion, deletion and substitution per phone.
    # Computed before either dump so that neither file can end up holding
    # the zero placeholders.
    prob_insert = np.zeros(num_ph)
    prob_del = np.zeros(num_ph)
    prob_substi = np.zeros((num_ph, num_ph))
    for ph, data in op_dict.items():
        total = data['total']
        prob_insert[ph] = data['insertions'] / total if total else 0
        prob_del[ph] = data['deletions'] / total if total else 0
        prob_substi[ph] = data['substitutions'] / total if total else 0

    # If the testing loss is the minimum so far, store the model as the best
    # one; only when train/test run together, i.e. mode is train.
    is_best = test_loss == min(x[0] for x in self.test_losses)
    if is_best and self.mode == 'train':
        print("Best new model found!")
        self.model.save_model(True, epoch, self.train_losses,
                              self.test_losses, self.edit_dist,
                              self.arch_name)
        # Dump best probability
        prob_dump_path = os.path.join(self.config['dir']['pickle'],
                                      self.arch_name, 'probs.pkl')
        with open(prob_dump_path, 'wb') as f:
            pickle.dump((prob_insert, prob_del, prob_substi), f)
            print("Dumped best probabilities")

    if self.mode == 'train':
        # Dump probabilities for this epoch
        prob_dump_path = os.path.join(self.config['dir']['pickle'],
                                      self.arch_name,
                                      str(epoch) + '_probs.pkl')
        with open(prob_dump_path, 'wb') as f:
            pickle.dump((prob_insert, prob_del, prob_substi), f)
            print("Dumped probabilities")

    self.model.train()

    return edit_dist_batch