def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    """Augment each hypothesis in an n-best list with an NNLM log-prob feature.

    Loads an MLP language model from ``model_path``, scores every n-gram of
    every hypothesis in ``input_nbest_path`` (words mapped to ids through the
    vocabulary at ``vocab_path``), appends the summed n-gram log-probability
    as a new feature on each item, and writes the augmented list to
    ``output_nbest_path``.
    """
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)
    vocab = VocabManager(vocab_path)
    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        # Pad with (ngram_size - 1) start symbols, plus an end symbol when the
        # vocabulary expects one, then slide a window of ngram_size over the ids.
        # NOTE(review): mutates the caller's token list in place (padding is
        # prepended/appended) — callers here pass a fresh list each time.
        for _ in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')
    L.info('Augmenting: ' + input_nbest_path)
    start_time = time.time()
    counter = 0
    cache = dict()  # str(ngram) -> log probability, memoized across groups
    for group in input_nbest:
        # First pass: collect every n-gram not scored yet.
        ngram_list = []
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            for ngram in ngrams:
                # BUGFIX: dict.has_key() was removed in Python 3; the `in`
                # operator is equivalent and works on Python 2 and 3.
                if str(ngram) not in cache:
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000  # placeholder, overwritten below
        # Score all new n-grams in one batched model call.
        if len(ngram_list) > 0:
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(
                ngram_array[:, 0:-1], ngram_array[:, -1])
            for i in range(len(ngram_list)):
                cache[str(ngram_list[i])] = ngram_log_prob_list[i]
        # Second pass: sum the cached log-probs per hypothesis and emit.
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            sum_ngram_log_prob = 0
            for ngram in ngrams:
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
        counter += 1
    output_nbest.close()
    L.info("Ran for %.2fs" % (time.time() - start_time))
def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    """Add a neural-LM log-probability feature to every n-best hypothesis.

    The model at ``model_path`` scores the n-grams of each hypothesis read
    from ``input_nbest_path`` (tokens converted to ids via ``vocab_path``);
    the per-hypothesis sum is appended as a feature and the result written
    to ``output_nbest_path``.
    """
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)
    vocab = VocabManager(vocab_path)
    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        # Prepend ngram_size-1 start markers (and append an end marker when
        # the vocab uses end padding) before windowing over the id sequence.
        # NOTE(review): pads the passed-in list in place.
        for _ in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')
    L.info('Augmenting: ' + input_nbest_path)
    start_time = time.time()
    counter = 0
    cache = dict()  # memoizes str(ngram) -> log probability across groups
    for group in input_nbest:
        ngram_list = []
        # Gather the n-grams this group introduces that are not cached yet.
        for item in group:
            for ngram in get_ngrams(item.hyp.split()):
                # BUGFIX: dict.has_key() no longer exists on Python 3;
                # membership via `in` behaves identically on Python 2 and 3.
                if str(ngram) not in cache:
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000  # sentinel, replaced just below
        if len(ngram_list) > 0:
            # One batched forward pass for all newly-seen n-grams.
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(
                ngram_array[:, 0:-1], ngram_array[:, -1])
            for idx, ngram in enumerate(ngram_list):
                cache[str(ngram)] = ngram_log_prob_list[idx]
        # Emit each hypothesis with its summed n-gram log-probability.
        for item in group:
            sum_ngram_log_prob = 0
            for ngram in get_ngrams(item.hyp.split()):
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
        counter += 1
    output_nbest.close()
    L.info("Ran for %.2fs" % (time.time() - start_time))
# NOTE(review): fragment — the enclosing batching loop (presumably
# `while flag:` plus the `for`-loop header that fills group_list) and the
# definitions of flag, group_list, input_nbest, pool, process_group, args,
# output_scores, output_nbest, output_1best, counter and group_counter all
# live outside this chunk; indentation below is reconstructed — verify
# against the full file.
try:
    # input_nbest.next() is the Python 2 iterator protocol
    # (would be next(input_nbest) on Python 3).
    group_list.append(input_nbest.next())
except StopIteration:
    # Input exhausted: signal the surrounding loop to stop after this batch.
    flag = False
if len(group_list) > 0:
    # Rescore the collected groups in parallel; outputs[i] appears to map
    # hypothesis index -> score for group_list[i] — TODO confirm against
    # process_group's definition.
    outputs = pool.map(process_group, group_list)
    for i in range(len(group_list)):
        scores = outputs[i]
        group = group_list[i]
        # Hypothesis indices ordered best-score-first.
        sorted_indices = sorted(scores, key=scores.get, reverse=True)
        if args.out_scores_path:
            for idx in scores:
                output_scores.write(str(group.group_index) + ' ' + str(idx) + ' ' + str(scores[idx]) + "\n")
        if args.out_nbest_path:
            for idx in sorted_indices:
                output_nbest.write(group[idx])
        # Top-scoring hypothesis goes to the 1-best output.
        output_1best.write(group[sorted_indices[0]].hyp + "\n")
    counter += 1
    group_counter += len(group_list)
    if counter % 5 == 0:
        L.info("%i groups processed" % (group_counter))
L.info("Finished processing %i groups" % (group_counter))
if args.out_scores_path:
    output_scores.close()
if args.out_nbest_path:
    output_nbest.close()
output_1best.close()
# NOTE(review): chunk of a larger script — flag, input_nbest, pool,
# process_group, args, output_scores, output_nbest, output_1best, counter
# and group_counter are defined outside this view, and the batch-collection
# code is presumably wrapped in a `while flag:` loop; indentation here is
# reconstructed — verify against the full file.
group_list = []
# Collect up to args.threads n-best groups for one parallel batch.
for i in range(args.threads):
    try:
        # BUGFIX: iterator.next() was removed in Python 3; the next()
        # builtin is equivalent on Python 2.6+ and Python 3.
        group_list.append(next(input_nbest))
    except StopIteration:
        flag = False  # input exhausted; surrounding loop should terminate
if len(group_list) > 0:
    # Rescore the batch in parallel; outputs[i] appears to map hypothesis
    # index -> score for group_list[i] — TODO confirm with process_group.
    outputs = pool.map(process_group, group_list)
    for i in range(len(group_list)):
        scores = outputs[i]
        group = group_list[i]
        # Hypothesis indices sorted by descending score.
        sorted_indices = sorted(scores, key=scores.get, reverse=True)
        if args.out_scores_path:
            # One "<group> <hyp-index> <score>" line per hypothesis.
            for idx in scores:
                output_scores.write(str(group.group_index) + ' ' + str(idx) + ' ' + str(scores[idx]) + "\n")
        if args.out_nbest_path:
            # Re-emit the n-best list in the new score order.
            for idx in sorted_indices:
                output_nbest.write(group[idx])
        # Best-scoring hypothesis goes to the 1-best output.
        output_1best.write(group[sorted_indices[0]].hyp + "\n")
    counter += 1
    group_counter += len(group_list)
    if counter % 5 == 0:
        L.info("%i groups processed" % (group_counter))
L.info("Finished processing %i groups" % (group_counter))
if args.out_scores_path:
    output_scores.close()
if args.out_nbest_path:
    output_nbest.close()
output_1best.close()