Example #1
def get_clean_clues(json_file_or_json_dir,
                    load_from_json_files: bool = False,
                    verify=True,
                    ) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:
    if load_from_json_files:
        soln_to_clue_map, all_clue_list = orig_get_clean_clues(
            json_file_or_json_dir)
    else:
        with open(json_file_or_json_dir, 'r') as f:
            all_clue_list = json.load(f)
        all_clue_list = list(map(CleanGuardianClue.from_json, all_clue_list))
        soln_to_clue_map = make_stc_map(all_clue_list)

    # add indices
    for idx, c in enumerate(all_clue_list):
        c.idx = idx

    # print the distribution
    ctr = Counter()
    for c in all_clue_list:
        ctr[len(c.lengths)] += 1
    log.info(ctr)

    # sanity check: the map should cover every clue exactly once
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clue_list)

    if verify:
        assert len(all_clue_list) == 142380, 'Your clues do not match the ones in the Decrypting paper'
        log.info('Clue list length matches the Decrypting paper expected length')

    return soln_to_clue_map, all_clue_list
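For reference, make_stc_map (used throughout these examples) builds the solution-to-clue map. A minimal sketch, assuming each clue exposes a soln attribute (the attribute name is a guess, not confirmed by this source):

from collections import defaultdict
from typing import Dict, List

def make_stc_map(clues: List['BaseClue']) -> Dict[str, List['BaseClue']]:
    # Sketch only: group clues under their solution string. The real
    # make_stc_map may normalize solutions (e.g. casing) before grouping;
    # the loaders' assertion (map sizes sum to len(clues)) holds for
    # this simple grouping.
    stc = defaultdict(list)
    for c in clues:
        stc[c.soln].append(c)  # .soln is an assumed field name
    return dict(stc)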
Example #2
def load_splits_from_json(json_file_name) -> SplitReturn:
    try:
        log.info(f'Loading splits directly from given json files. Using {json_file_name}')
        with open(json_file_name, 'r') as f:
            splits_dict = json.load(f)
    except FileNotFoundError as e:
        log.error('Json not found. Did you unzip as per the readme?')
        raise e

    split_tuple = splits_dict['train'], splits_dict['val'], splits_dict['test']
    clue_list_tuple = tuple([list(map(CleanGuardianClue.from_json, split)) for split in split_tuple])
    all_clues = [c for clue_list in clue_list_tuple for c in clue_list]
    soln_to_clue_map = make_stc_map(all_clues)

    # note that there will be no indices in these clues

    # print the distribution
    ctr = Counter()
    for c in all_clues:
        ctr[len(c.lengths)] += 1
    log.info(ctr)

    # sanity check: the map should cover every clue exactly once
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clues)

    assert len(all_clues) == 142380, 'Your clues do not match the ones in the Decrypting paper'
    log.info('Clue list length matches the Decrypting paper expected length')

    check_splits(all_clues, clue_list_tuple)

    return soln_to_clue_map, all_clues, clue_list_tuple
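A hypothetical invocation of the loader above; the json path is illustrative, not from this source:

# Hypothetical usage; 'data/splits.json' is a placeholder path.
soln_to_clue_map, all_clues, (train, val, test) = load_splits_from_json('data/splits.json')
log.info(f'train={len(train)}, val={len(val)}, test={len(test)}')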
Example #3
def orig_get_clean_clues(json_output_dir,
                         do_filter_dupes=True,
                         verify=True,
                         strip_identifying_info=False,
                         ) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:
    log.info(f'loading from {json_output_dir}')
    # map from puz_id => List[GuardianClue]; note that defaultdict(None)
    # has no default factory, so it behaves like a plain dict here
    parsed_puzzles: Dict[str, List[GuardianClue]] = defaultdict(None)

    # load full glob
    if strip_identifying_info:
        clue_cls = CleanGuardianClue
    else:
        clue_cls = GuardianClue
    all_clue_list = all_json_files_to_json_list(json_output_dir,
                                                subsite="cryptic",
                                                puzzle_dict=parsed_puzzles,
                                                skip_if_in_dict=True,
                                                verify=verify,
                                                clue_cls=clue_cls)

    soln_to_clue_map = make_stc_map(all_clue_list)

    # Remove anything that is exactly the same up to small diffs
    # removes 1610 normalized clues
    if do_filter_dupes:
        soln_to_clue_map, all_clue_list = filter_clues(soln_to_clue_map)

    return soln_to_clue_map, all_clue_list
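filter_clues is not defined in these examples. One plausible reading of "exactly the same up to small diffs" is deduplication on a normalized (clue text, solution) key; a sketch under that assumption, with .clue as an assumed field name:

import re
from typing import Dict, List, Tuple

def filter_clues(soln_to_clue_map: Dict[str, List['BaseClue']]) \
        -> Tuple[Dict[str, List['BaseClue']], List['BaseClue']]:
    # Sketch only: keep one clue per normalized (clue text, solution) pair.
    # The real filter_clues may define "small diffs" differently.
    def norm(s: str) -> str:
        return re.sub(r'[^a-z0-9]+', ' ', s.lower()).strip()

    seen = set()
    out_map: Dict[str, List['BaseClue']] = {}
    for soln, clues in soln_to_clue_map.items():
        kept = []
        for c in clues:
            key = (norm(c.clue), soln)  # .clue is an assumed field name
            if key not in seen:
                seen.add(key)
                kept.append(c)
        if kept:
            out_map[soln] = kept
    all_clue_list = [c for v in out_map.values() for c in v]
    return out_map, all_clue_list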
Example #4
def make_disjoint_split(all_clues: List[BaseClue],
                        seed=42) -> Tuple[List[BaseClue], ...]:

    soln_to_clue_map = make_stc_map(all_clues)
    train, val, test = [], [], []
    for k, v in soln_to_clue_map.items():
        # the builtin hash() is salted per process (PYTHONHASHSEED), so it
        # is not deterministic across python runs; use a stable hash instead
        h = safe_hash(k[:2]) % 5
        if h < 3:
            train.extend(v)
        elif h < 4:
            val.extend(v)
        else:
            test.extend(v)

    out_tuple = train, val, test
    rng = random.Random(seed)
    for split_list in out_tuple:
        rng.shuffle(split_list)
    check_splits(all_clues, out_tuple)

    return out_tuple
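safe_hash is referenced but not defined here; the inline comment explains why the builtin is avoided (hash() is salted per process via PYTHONHASHSEED). A minimal deterministic stand-in, assuming any stable digest is acceptable:

import hashlib

def safe_hash(s: str) -> int:
    # Deterministic across python runs and machines, unlike the builtin
    # hash(). Sketch only; the real safe_hash may use a different digest.
    return int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16)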
Example #5
def get_clean_xd_clues(filename,
                       remove_if_not_in_dict=True,
                       do_filter_dupes=True,
                       ) -> Tuple[Dict[str, List[BaseClue]], List[BaseClue]]:

    logging.info(f'loading xd (ACW) set from {filename}')
    all_clue_list = xd_load_and_filter_clues(
        filename,
        remove_if_not_in_dict=remove_if_not_in_dict,
        strip_trailing_period=True,
        remove_questions=True,
        remove_likely_abbreviations=True,
        remove_fillin=True)

    # generate soln to clue map
    # soln:str -> List[gc]
    soln_to_clue_map = make_stc_map(all_clue_list)

    # Remove anything that is exactly the same up to small diffs
    # removes 1610 normalized clues
    if do_filter_dupes:
        soln_to_clue_map, all_clue_list = filter_clues(soln_to_clue_map)

    # add indices and a note about dataset
    for idx, c in enumerate(all_clue_list):
        c.idx = idx
        c.dataset = filename

    # print the distribution
    ctr = Counter()
    for c in all_clue_list:
        ctr[len(c.lengths)] += 1
    logging.info(ctr)

    # sanity check: the map should cover every clue exactly once
    assert sum(map(len, soln_to_clue_map.values())) == len(all_clue_list)

    return soln_to_clue_map, all_clue_list
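A hypothetical end-to-end use of this loader with the splitter from Example #4; the file path is a placeholder:

# Hypothetical usage; 'data/xd/clues.tsv' is a placeholder path.
stc_map, xd_clues = get_clean_xd_clues('data/xd/clues.tsv')
train, val, test = make_disjoint_split(xd_clues, seed=42)
logging.info(f'{len(train)} train / {len(val)} val / {len(test)} test clues')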