def get_clean_clues(json_file_or_json_dir,
                    load_from_json_files: bool = False,
                    verify=True) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:
    """Load the cleaned Guardian clue set.

    Either deserialize a single pre-cleaned json file (default) or rebuild the
    set from the raw per-puzzle json directory via orig_get_clean_clues.
    Returns (solution -> clues map, flat clue list).
    """
    if load_from_json_files:
        soln_to_clue_map, all_clue_list = orig_get_clean_clues(json_file_or_json_dir)
    else:
        with open(json_file_or_json_dir, 'r') as f:
            all_clue_list = json.load(f)
        all_clue_list = list(map(CleanGuardianClue.from_json, all_clue_list))
        soln_to_clue_map = make_stc_map(all_clue_list)

    # add indices (the dataset note is omitted for the clean set)
    for idx, c in enumerate(all_clue_list):
        c.idx = idx
        # if not strip_identifying_info:
        #     c.dataset = json_output_dir

    # print the distribution
    ctr = Counter()
    for c in all_clue_list:
        ctr[len(c.lengths)] += 1
    log.info(ctr)

    # Verify that the map and the flat list cover the same clues
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clue_list)
    if verify:
        assert len(all_clue_list) == 142380, 'Your clues do not match the ones in Decrypting paper'
        log.info('Clue list length matches Decrypting paper expected length')

    return soln_to_clue_map, all_clue_list
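
# Usage sketch for get_clean_clues. The paths below are illustrative placeholders,
# not files shipped with this module; point them at your own data.
def _example_get_clean_clues():
    # default path: a single json file of serialized CleanGuardianClue records
    stc_map, clues = get_clean_clues('data/clean_guardian_clues.json')
    # alternative: rebuild the clean set from the raw per-puzzle json directory
    stc_map_raw, clues_raw = get_clean_clues('data/guardian_raw/', load_from_json_files=True)
    # stc_map maps each solution string to the list of clues with that answer
    return stc_map, clues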
def load_splits_from_json(json_file_name) -> SplitReturn:
    """Load pre-generated train/val/test splits from a single json file.

    Returns (solution -> clues map, flat clue list, (train, val, test)).
    """
    try:
        log.info(f'Loading splits directly from given json files. Using {json_file_name}')
        with open(json_file_name, 'r') as f:
            splits_dict = json.load(f)
    except FileNotFoundError:
        log.error('Json not found. Did you unzip as per the readme?')
        raise

    split_tuple = splits_dict['train'], splits_dict['val'], splits_dict['test']
    clue_list_tuple = tuple(list(map(CleanGuardianClue.from_json, split)) for split in split_tuple)
    all_clues = [c for clue_list in clue_list_tuple for c in clue_list]
    soln_to_clue_map = make_stc_map(all_clues)
    # note that there will be no indices in these clues

    # print the distribution
    ctr = Counter()
    for c in all_clues:
        ctr[len(c.lengths)] += 1
    log.info(ctr)

    # Verify that the map and the flat list cover the same clues
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clues)
    assert len(all_clues) == 142380, 'Your clues do not match the ones in Decrypting paper'
    log.info('Clue list length matches Decrypting paper expected length')

    check_splits(all_clues, clue_list_tuple)

    return soln_to_clue_map, all_clues, clue_list_tuple
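
# Usage sketch for load_splits_from_json. The filename is a placeholder; the real
# path comes from the repo's data download/unzip step described in the readme.
def _example_load_splits():
    stc_map, all_clues, (train, val, test) = load_splits_from_json('data/disjoint_split.json')
    # the three split lists partition the full clue list
    assert len(train) + len(val) + len(test) == len(all_clues)
    return train, val, test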
def orig_get_clean_clues(json_output_dir,
                         do_filter_dupes=True,
                         verify=True,
                         strip_identifying_info=False) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:
    """Parse the raw Guardian puzzle json dump and build the clean clue set.

    Optionally strips identifying metadata and filters near-duplicate clues.
    Returns (solution -> clues map, flat clue list).
    """
    log.info(f'loading from {json_output_dir}')

    # map from puz_id => List[GuardianClue]
    # (defaultdict(None) has no default factory, so it behaves like a plain dict)
    parsed_puzzles: Dict[str, List[GuardianClue]] = defaultdict(None)

    # load the full glob of puzzle json files
    if strip_identifying_info:
        clue_cls = CleanGuardianClue
    else:
        clue_cls = GuardianClue
    all_clue_list = all_json_files_to_json_list(json_output_dir,
                                                subsite="cryptic",
                                                puzzle_dict=parsed_puzzles,
                                                skip_if_in_dict=True,
                                                verify=verify,
                                                clue_cls=clue_cls)

    soln_to_clue_map = make_stc_map(all_clue_list)

    # Remove anything that is exactly the same up to small diffs
    # (removes 1610 normalized clues)
    if do_filter_dupes:
        soln_to_clue_map, all_clue_list = filter_clues(soln_to_clue_map)

    return soln_to_clue_map, all_clue_list
def make_disjoint_split(all_clues: List[BaseClue], seed=42) -> Tuple[List[BaseClue], ...]:
    """Produce a disjoint train/val/test split keyed on the answer.

    All clues whose solutions share the same first two characters land in the
    same split, so no answer appears in more than one split. Three of the five
    hash buckets go to train, one to val, one to test.
    """
    soln_to_clue_map = make_stc_map(all_clues)

    train, val, test = [], [], []
    for k, v in soln_to_clue_map.items():
        # the builtin hash() is not deterministic across python runs, so use safe_hash
        h = safe_hash(k[:2]) % 5
        if h < 3:
            train.extend(v)
        elif h < 4:
            val.extend(v)
        else:
            test.extend(v)

    out_tuple = train, val, test
    rng = random.Random(seed)
    for l in out_tuple:
        rng.shuffle(l)

    check_splits(all_clues, out_tuple)

    return out_tuple
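
# make_disjoint_split requires safe_hash to be deterministic across python runs,
# since the builtin hash() of a str is randomly salted per process. A minimal
# sketch of such a helper, assuming an md5 digest over the utf-8 bytes; the
# actual safe_hash used by this repo may be implemented differently.
def _example_safe_hash(s: str) -> int:
    import hashlib
    return int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16)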
def get_clean_xd_clues(filename,
                       remove_if_not_in_dict=True,
                       do_filter_dupes=True) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:
    """Load and filter the xd (American crossword, ACW) clue set.

    Returns (solution -> clues map, flat clue list).
    """
    logging.info(f'loading xd (ACW) set from {filename}')
    all_clue_list = xd_load_and_filter_clues(filename,
                                             remove_if_not_in_dict=remove_if_not_in_dict,
                                             strip_trailing_period=True,
                                             remove_questions=True,
                                             remove_likely_abbreviations=True,
                                             remove_fillin=True)

    # generate soln to clue map: soln (str) -> List[gc]
    soln_to_clue_map = make_stc_map(all_clue_list)

    # Remove anything that is exactly the same up to small diffs
    # removes 1610 normalized clues
    if do_filter_dupes:
        soln_to_clue_map, all_clue_list = filter_clues(soln_to_clue_map)

    # add indices and a note about the source dataset
    for idx, c in enumerate(all_clue_list):
        c.idx = idx
        c.dataset = filename

    # print the distribution
    ctr = Counter()
    for c in all_clue_list:
        ctr[len(c.lengths)] += 1
    logging.info(ctr)

    # Verify that the map and the flat list cover the same clues
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clue_list)

    return soln_to_clue_map, all_clue_list
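
# Usage sketch for get_clean_xd_clues. The filename is a placeholder for whatever
# xd clue dump xd_load_and_filter_clues expects in your setup.
def _example_get_clean_xd_clues():
    stc_map, clues = get_clean_xd_clues('data/xd_clues.tsv')
    # each surviving clue gets an index and a record of the source file
    assert all(c.dataset == 'data/xd_clues.tsv' for c in clues)
    return stc_map, clues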