def score_results(self, results, verbose=False):
    """Score generated navigation instructions against ground-truth references.

    Args:
        results: dict mapping instr_ids to dicts; each dict must contain (at
            least) a 'words' field holding the tokenized hypothesis, and may
            contain a 'score' field with a model score.
        verbose: if True, print reference/prediction pairs for every 100th
            scored item.

    Returns:
        A tuple ``(score_summary, instruction_replaced_gt)`` where
        ``score_summary`` is a dict with keys 'model_score', 'bleu' and
        'unpenalized_bleu', and ``instruction_replaced_gt`` is the list of
        ground-truth entries with their 'instructions' replaced by the
        (joined) predicted words.

    Raises:
        AssertionError: if any instruction id from this split is missing
            from ``results``.
    """
    instr_ids = set(self.instr_ids)
    instr_count = 0
    results_by_base_id = {}
    mismatches = []
    for instr_id, result in results.items():
        if instr_id in instr_ids:
            instr_ids.remove(instr_id)
            # Several instr_ids share one base path id; all of them are
            # expected to carry the same predicted words.
            base_id = int(instr_id.split('_')[0])
            if base_id in results_by_base_id:
                old_predicted = results_by_base_id[base_id]['words']
                new_predicted = result['words']
                if old_predicted != new_predicted:
                    mismatches.append((old_predicted, new_predicted))
            else:
                results_by_base_id[base_id] = result
    if mismatches:
        print("mismatching outputs for sentences:")
        for old_pred, new_pred in mismatches:
            print(old_pred)
            print(new_pred)
            print()
    assert len(instr_ids) == 0, \
        'Missing %d of %d instruction ids from %s' % (
            len(instr_ids), len(self.instr_ids), ",".join(self.splits))

    all_refs = []
    all_hyps = []
    model_scores = []
    instruction_replaced_gt = []
    skip_count = 0
    skipped_refs = set()
    for base_id, result in sorted(results_by_base_id.items()):
        instr_count += 1
        gt = self.gt[base_id]
        tokenized_refs = [
            Tokenizer.split_sentence(ref) for ref in gt['instructions']
        ]
        tokenized_hyp = result['words']
        # Shallow copy so the stored ground truth is not mutated.
        replaced_gt = gt.copy()
        replaced_gt['instructions'] = [' '.join(tokenized_hyp)]
        instruction_replaced_gt.append(replaced_gt)
        if 'score' in result:
            model_scores.append(result['score'])
        # BLEU requires a fixed number of references per hypothesis; skip
        # (but report) paths that do not have exactly that many.
        if len(tokenized_refs) != self.instructions_per_path:
            skip_count += 1
            skipped_refs.add(base_id)
            continue
        all_refs.append(tokenized_refs)
        all_hyps.append(tokenized_hyp)
        if verbose and instr_count % 100 == 0:
            for i, ref in enumerate(tokenized_refs):
                print("ref {}:\t{}".format(i, ' '.join(ref)))
            print("pred :\t{}".format(' '.join(tokenized_hyp)))
            print()
    if skip_count != 0:
        print("skipped {} instructions without {} refs: {}".format(
            skip_count, self.instructions_per_path,
            ' '.join(str(i) for i in skipped_refs)))
    # Guard the empty case explicitly: np.mean([]) would emit a
    # RuntimeWarning and silently produce nan.
    model_score = np.mean(model_scores) if model_scores else float('nan')
    bleu, unpenalized_bleu = multi_bleu(all_refs, all_hyps)
    score_summary = {
        'model_score': model_score,
        'bleu': bleu,
        'unpenalized_bleu': unpenalized_bleu,
    }
    return score_summary, instruction_replaced_gt
# Compare the vocabulary-usage distribution of inferred instructions with
# the ground-truth training language, and plot both (Zipf-style) curves.
with open("results.pkl", 'rb') as f:  # close the handle deterministically
    all_results = pickle.load(f)
for env_name, results in all_results.items():
    print(env_name)
    for path_id, result in results.items():
        print(result)
        inf = tok.split_sentence(result['inference'])
        inf_count.update(inf)
        # np.random.choice cannot sample from a list of token lists (it
        # raises "a must be 1-dimensional" when the array is 2-D or ragged),
        # so pick a random reference by index and tokenize only that one.
        gt_refs = result['gt']
        ref = tok.split_sentence(gt_refs[np.random.randint(len(gt_refs))])
        ref_count.update(ref)
# NOTE(review): the flattened original is ambiguous about whether plotting
# happened per-environment or once at the end; the single fixed output
# filename suggests one aggregate plot — confirm against the repo history.
infs = sorted(inf_count.values(), reverse=True)
refs = sorted(ref_count.values(), reverse=True)
plt.plot(infs, label="Inferred Language")
plt.plot(refs, label="Train Language")
plt.title("Distribution of Vocabulary")
plt.xlabel("Words")
plt.legend(loc="upper right")
plt.ylabel('Amount of Usage')
plt.savefig("vocab_dist.png")
plt.cla()  # clear the axes so later plots start fresh
# Antonym pairs of direction-bearing words used to detect instructions that
# mention spatial directions (d_1..d_3 are defined earlier in the file).
d_4 = ['front', 'back']
d_5 = ['above', 'under']
d_6 = ['enter', 'exit']
d_7 = ['backward', 'forward']
d_8 = ['away from', 'towards']
d_9 = ['into', 'out of']
d_10 = ['inside', 'outside']
d_ls = d_1 + d_2 + d_3 + d_4 + d_5 + d_6 + d_7 + d_8 + d_9 + d_10

# Build the membership set once: set.isdisjoint is O(len(words)) per
# instruction vs the original O(len(d_ls) * len(words)) double scan.
# NOTE(review): multi-word entries ('away from', 'out of') can never equal a
# single token, so they never match here — same as in the original; confirm
# whether phrase matching was intended.
_direction_words = set(d_ls)

# Randomly sample instruction pairs that contain a direction word.
# NOTE(review): if fewer than perturb_num direction-bearing instructions
# exist, this loop never terminates — confirm the data guarantees enough.
direct = []
while len(direct) < perturb_num:
    i = np.random.randint(len(pairs_idx))
    t_i = pairs_idx[i]
    ins_i = data[t_i[0]]['instructions'][t_i[1]].lower()
    words = Tokenizer.split_sentence(ins_i)
    # Equivalent to any(word in words for word in d_ls).
    if not _direction_words.isdisjoint(words):
        direct.append(pairs_idx.pop(i))

# preselect viewpoint swap
upper = 0.6
lower = 0.3
import networkx as nx
from ndtw import DTW, load_nav_graphs

# Load connectivity graphs for every distinct scan in the data.
# dict.fromkeys gives an order-preserving O(n) dedup (the original list
# membership test was O(n^2)).
scans = list(dict.fromkeys(traj['scan'] for traj in data))
graphs = load_nav_graphs(scans)