def backwords_counter(nwords_list: TextIO, splitter: str, start_chr: str, end_chr: str,
                      start4words: int, step4words: int, threshold: int, max_gram: int):
    """Build a back-off n-gram transition table from a training file.

    Every line is turned into ``[start_chr, *parse_line(...), end_chr]``.
    Transition probabilities are then estimated for each order from 2 up to
    the effective maximum order.  Transitions seen fewer than ``threshold``
    times are dropped and their probability mass is redistributed according
    to the distribution of the parent prefix (the prefix without its first
    element).

    :param nwords_list: training file, one entry per line
    :param splitter: forwarded to ``parse_line``
    :param start_chr: marker prepended to every entry
    :param end_chr: marker appended to every entry
    :param start4words: forwarded to ``parse_line``
    :param step4words: forwarded to ``parse_line``
    :param threshold: minimum count for a prefix / transition to be kept
    :param max_gram: requested maximum order; values below 2 are treated as 2
        and the result is capped by the longest section length whose total
        count reaches ``threshold``
    :return: ``(nwords_float_dict, words)`` where ``nwords_float_dict`` maps a
        prefix tuple to ``{transition: probability}`` (the empty tuple holds
        the order-1 distribution) and ``words`` maps every section, markers
        included, to its number of occurrences
    """
    zero = tuple()
    line_num = wc_l(nwords_list)
    words: Dict[str, int] = defaultdict(int)
    zero_counts: Dict[str, int] = defaultdict(int)
    # section length -> {section tuple -> number of identical lines}
    section_dict = defaultdict(lambda: defaultdict(int))
    for line in tqdm(nwords_list, total=line_num, desc="Reading: "):  # type: str
        line = line.strip("\r\n")
        sections = [start_chr]
        sections.extend(parse_line(line, splitter, start4words, step4words))
        sections.append(end_chr)
        for sec in sections:
            words[sec] += 1
            # The start marker is never a transition target.
            if sec != start_chr:
                zero_counts[sec] += 1
        section_dict[len(sections)][tuple(sections)] += 1

    zero_sum = sum(zero_counts.values())
    nwords_float_dict = {
        zero: {trans: cnt / zero_sum for trans, cnt in zero_counts.items()}
    }

    min_gram = 2
    len_list = [_l for _l, s in section_dict.items() if sum(s.values()) >= threshold]
    if not len_list:
        # Previously this case crashed inside max() with a bare ValueError
        # (e.g. for an empty training file); fail with a clear message instead.
        print(f"no section length appears at least {threshold} times, "
              f"fail to model the password dataset", file=sys.stderr)
        sys.exit(-1)
    max_gram = min(max(len_list), max(min_gram, max_gram))

    for n in tqdm(range(min_gram, max_gram + 1), desc="Counting: "):
        prefix_words_num = n - 1
        nwords_dict: Dict[Tuple, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for sec_len, sections_cnt in section_dict.items():
            if n > sec_len:
                continue
            for sections, cnt in sections_cnt.items():
                for i in range(len(sections) - prefix_words_num):
                    grams = tuple(sections[i:i + prefix_words_num])
                    transition = sections[i + prefix_words_num]
                    nwords_dict[grams][transition] += cnt

        for prefix, trans_cnt in nwords_dict.items():
            total = sum(trans_cnt.values())
            if total < threshold:
                continue
            trans_prob = {trans: cnt / total
                          for trans, cnt in trans_cnt.items() if cnt >= threshold}
            missing = 1 - sum(trans_prob.values())
            if missing == 1:
                # No transition survived the threshold: drop the prefix.
                continue
            if missing > 0:
                # Hand the mass of the pruned transitions to the lower-order
                # (parent) distribution computed in the previous iteration.
                parent_prefix = prefix[1:]
                for trans, p in nwords_float_dict[parent_prefix].items():
                    trans_prob[trans] = trans_prob.get(trans, 0) + p * missing
            nwords_float_dict[prefix] = trans_prob
    del section_dict
    return nwords_float_dict, words
def nwords_counter(nwords_list: TextIO, n: int = 4, end_chr: str = "\x03", threshold: int = 10):
    """Count n-gram transitions over pre-segmented passwords.

    Each line is split on tabs: column 0 is the password and every second
    column starting at index 1 is taken as a segment (only the *length* of
    each segment is used to slice the password).  ``end_chr`` is appended as
    a final segment.  Segments that occur fewer than ``threshold`` times are
    broken into single characters before counting.

    :param nwords_list: training file; it is closed by this function
    :param n: order of the model, the context keeps the last ``n - 1`` characters
    :param end_chr: end marker appended to every password
    :param threshold: minimum occurrences for a segment to be kept as a unit
    :return: ``(nwords_float_dict, words)`` where ``nwords_float_dict`` maps a
        context string to ``{next unit: probability}`` and ``words`` maps each
        segment to its number of occurrences
    :raises ValueError: if the segments of a line do not rebuild the password,
        or the password including the end marker is shorter than 4 characters
    """
    nwords_dict: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    prefix_words = n - 1
    line_num = wc_l(nwords_list)
    section_dict = defaultdict(int)
    words: Dict[str, int] = defaultdict(int)
    for line in tqdm(nwords_list, total=line_num, desc="Parsing: "):  # type: str
        line = line.strip("\r\n")
        items = line.split("\t")
        pwd = items[0] + end_chr
        raw_sections = items[1::2]
        raw_sections.append(end_chr)
        start = 0
        sections = []
        for sec in raw_sections:
            word = pwd[start:start + len(sec)]
            sections.append(word)
            start += len(sec)
            words[word] += 1
        if "".join(sections) != pwd or len(pwd) < 4:
            raise ValueError(f"error1: malformed or too short training line: {line!r}")
        section_dict[tuple(sections)] += 1
    # Only membership is needed below, so a set is enough.
    needed = {word for word, cnt in words.items() if cnt >= threshold}
    nwords_list.close()

    for sections, cnt in tqdm(section_dict.items(), desc="Counting: "):
        n_sections = []
        for sec in sections:
            if sec in needed:
                n_sections.append(sec)
            else:
                # Rare segment: fall back to its individual characters.
                n_sections.extend(sec)
        prev_chrs = ""
        for sec in n_sections:
            nwords_dict[prev_chrs][sec] += cnt
            # ``s[-0:]`` is the whole string, so a zero-length context has to
            # be handled explicitly, otherwise the context grows without bound
            # when n == 1.
            prev_chrs = f"{prev_chrs}{sec}"[-prefix_words:] if prefix_words > 0 else ""
    del section_dict

    nwords_float_dict: Dict[str, Dict[str, float]] = {}
    for prefix, ends in tqdm(nwords_dict.items(), "Converting: "):
        total = sum(ends.values())
        nwords_float_dict[prefix] = {e: v / total for e, v in ends.items()}
    del nwords_dict
    return nwords_float_dict, words
def parse_file(
        self,
        testing_set: TextIO,
        using_component: bool = False,
) -> List[Tuple[Union[str, List[str]], int, float]]:
    """
    Score every distinct line of the test set with ``self.calc_ml2p``.

    :param testing_set: test set, one password per line
    :param using_component: when true, the first element of each result tuple
        is the second value returned by ``calc_ml2p`` instead of the password
    :return: list of tuples (pwd or components, appearance, minus log prob),
        sorted by the minus log prob in ascending order
    """
    total_lines = wc_l(testing_set)
    occurrences = defaultdict(int)
    for raw in tqdm(testing_set, desc="Reading: ", total=total_lines):
        occurrences[raw.strip("\r\n")] += 1

    scored: List[Tuple[Union[str, List[str]], int, float]] = []
    for password, count in tqdm(occurrences.items(), desc="Scoring: "):  # type: str, int
        minus_log_prob, parts = self.calc_ml2p(password)
        label = parts if using_component else password
        scored.append((label, count, minus_log_prob))

    # list.sort is stable, exactly like sorted().
    scored.sort(key=lambda entry: entry[2])
    return scored
def nwords_counter(nwords_list: TextIO, n: int, splitter: str, end_chr: str,
                   start4words: int, skip4words: int, start_chr: str = '\x00'):
    """Estimate fixed-order n-gram transition probabilities from a training file.

    Each line becomes ``n - 1`` copies of ``start_chr``, followed by the
    result of ``parse_line`` and a trailing ``end_chr``.  Every window of
    ``n - 1`` consecutive sections is a prefix and the section that follows
    it is the transition.

    :param nwords_list: training file; it is closed by this function
    :param n: order of the model
    :param splitter: forwarded to ``parse_line``
    :param end_chr: marker appended to every entry
    :param start4words: forwarded to ``parse_line``
    :param skip4words: forwarded to ``parse_line``
    :param start_chr: padding marker placed in front of every entry
    :return: ``(nwords_float_dict, words)`` where ``nwords_float_dict`` maps a
        prefix tuple to ``{transition: probability}`` and ``words`` maps every
        section (markers included) to its number of occurrences
    """
    order = n - 1
    total_lines = wc_l(nwords_list)
    line_counts = defaultdict(int)
    words: Dict[str, int] = defaultdict(int)
    for raw in tqdm(nwords_list, total=total_lines, desc="Reading: "):  # type: str
        raw = raw.strip("\r\n")
        parsed = parse_line(raw, splitter, start4words, skip4words)
        sections = [start_chr] * (n - 1)
        sections += parsed
        sections.append(end_chr)
        for unit in sections:
            words[unit] += 1
        line_counts[tuple(sections)] += 1
    nwords_list.close()

    counts: Dict[Tuple, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for seq, freq in tqdm(line_counts.items(), desc="Counting: "):
        for pos in range(len(seq) - order):
            # ``seq`` is already a tuple, so the slice is a tuple as well.
            context = seq[pos:pos + order]
            counts[context][seq[pos + order]] += freq
    del line_counts

    nwords_float_dict: Dict[Tuple, Dict[str, float]] = {}
    for context, followers in tqdm(counts.items(), "Converting: "):
        denominator = sum(followers.values())
        nwords_float_dict[context] = {
            unit: (hits / denominator) for unit, hits in followers.items()
        }
    del counts
    return nwords_float_dict, words
def backwords_counter(nwords_list: TextIO, splitter: str, start_chr: str, end_chr: str,
                      start4words: int, step4words: int, max_gram: int, threshold: int,
                      nwords_dict: Dict[Tuple, Dict[str, int]] = None,
                      words: Dict[str, int] = None):
    """Accumulate raw n-gram transition counts, optionally on top of earlier counts.

    Every entry is turned into ``[start_chr, *parse_line(...), end_chr]`` and
    counted for every order from 2 up to ``min(max_gram, longest entry)``.
    The counts are merged into ``nwords_dict`` in place, which allows the
    function to be called repeatedly with additional training data.

    :param nwords_list: training data, either a file (one entry per line) or
        a list of lines
    :param splitter: forwarded to ``parse_line``
    :param start_chr: marker prepended to every entry
    :param end_chr: marker appended to every entry
    :param start4words: forwarded to ``parse_line``
    :param step4words: forwarded to ``parse_line``
    :param max_gram: maximum order to count
    :param threshold: a prefix not yet present in ``nwords_dict`` is only
        added when at least one of its transitions reaches this count
    :param nwords_dict: existing counts to extend; a new dict when ``None``
    :param words: existing section counts to extend; a new dict when ``None``
    :return: ``(nwords_dict, words)``; the empty-tuple key of ``nwords_dict``
        holds the order-1 counts
    """
    # The two accumulators are defaulted independently so that supplying only
    # one of them neither crashes nor silently discards the supplied one.
    if nwords_dict is None:
        nwords_dict = {}
    if words is None:
        words = {}
    zero = tuple()
    if isinstance(nwords_list, list):
        line_num = len(nwords_list)
    else:
        line_num = wc_l(nwords_list)
    if line_num == 0:
        print("No passwords for training, early return!", file=sys.stderr)
        return nwords_dict, words

    # section length -> {section tuple -> number of identical entries}
    section_dict = defaultdict(lambda: defaultdict(int))
    actual_max_gram = 2
    for line in tqdm(nwords_list, total=line_num, desc="Reading: "):  # type: str
        line = line.strip("\r\n")
        sections = [start_chr]
        sections.extend(parse_line(line, splitter, start4words, step4words))
        sections.append(end_chr)
        for sec in sections:
            words[sec] = words.get(sec, 0) + 1
            # The start marker is never a transition target.
            if sec != start_chr:
                zero_counts = nwords_dict.setdefault(zero, {})
                zero_counts[sec] = zero_counts.get(sec, 0) + 1
        section_dict[len(sections)][tuple(sections)] += 1
        actual_max_gram = max(actual_max_gram, len(sections))

    for n in tqdm(range(2, min(max_gram, actual_max_gram) + 1), desc="N-Gram: "):
        order = n - 1
        tmp_nwords_dict: Dict[Tuple, Dict[str, int]] = {}
        for sec_len, sec_len_dict in section_dict.items():
            if sec_len < n:
                continue
            for sec, cnt in sec_len_dict.items():
                for i in range(sec_len - order):
                    prefix = sec[i:i + order]
                    transition = sec[i + order]
                    bucket = tmp_nwords_dict.setdefault(prefix, {})
                    bucket[transition] = bucket.get(transition, 0) + cnt
        if not tmp_nwords_dict:
            break
        # NOTE: this assumes that only cracked passwords are supplied as the
        # secondary training file.  Under that assumption the model first
        # removes transitions whose count is below the threshold, so cracked
        # passwords never contain the removed transitions.  Such transitions
        # can therefore be dropped early to save memory.
        for prefix, transitions in tmp_nwords_dict.items():
            if prefix not in nwords_dict:
                if any(cnt >= threshold for cnt in transitions.values()):
                    nwords_dict[prefix] = transitions
                continue
            origin = nwords_dict[prefix]
            for trans, v in transitions.items():
                origin[trans] = origin.get(trans, 0) + v
    return nwords_dict, words
def wrapper():
    """Command-line entry point for iterative ("secondary") Backwords training.

    Parses the command line, derives one ``(guess bound, hit bound)`` pair per
    round from ``--strategy``, runs ``secondary_cracker`` once per round, then
    trains a final model with ``backwords_counter`` and scores the testing
    file with it.  The following files are written into the ``--save`` folder:
    ``final_model.pickle``, ``iter_result.txt``, ``sectional_result.txt`` and
    ``config.json``.

    Invalid ``--strategy`` values are reported on stderr and the function
    returns without doing any work.
    """
    cli = argparse.ArgumentParser('Backwords secondary main')
    cli.add_argument("-i", "--training", dest="training", type=argparse.FileType('r'),
                     required=True, help="The training file, each password a line")
    cli.add_argument("-t", "--testing", dest="testing", type=argparse.FileType('r'),
                     required=True, help="The testing file, each password a line")
    cli.add_argument("-s", "--save", dest="save", required=True, type=str,
                     help='A folder, results will be saved in this folder')
    cli.add_argument(
        "--strategy", dest="strategy", required=True, type=str, nargs="+",
        # choices=['guesses', 'hits', 'samples'],
        help='`guesses <guesses1> <guesses2> ...` means guess number thresholds, '
             '`hits <cracked1> <cracked2>` means cracked passwords, '
             '`auto_hits <factor> <base> <termination>` means auto generate '
             '<cracked1 = factor * base> <cracked2> <cracked2 = factor ** 2 * base>, '
             '`samples <rounds>` means the number of iterations of '
             'Monte Carlo simulation')
    cli.add_argument("--size", dest="size", type=int, required=False, default=100000,
                     help="sample size")
    cli.add_argument(
        "--secondary-sample", dest="secondary_sample", type=int, required=False,
        default=10000000000,
        help="use some of the cracked passwords for secondary training.")
    cli.add_argument(
        "--splitter", dest="splitter", type=str, required=False, default="empty",
        help="how to divide different columns from the input file, "
             "set it \"empty\" to represent \'\', \"space\" for \' \', \"tab\" for \'\t\'"
    )
    cli.add_argument(
        "--start4word", dest="start4words", type=int, required=False, default=0,
        help="start index for words, to fit as much as formats of input. An entry per line. "
             "We get an array of words by splitting the entry. "
             "\"start4word\" is the index of the first word in the array")
    cli.add_argument(
        "--skip4word", dest="skip4words", type=int, required=False, default=1,
        help="there may be other elements between words, such as tags. "
             "Set skip4word larger than 1 to skip unwanted elements.")
    cli.add_argument("--max-gram", dest="max_gram", required=False, type=int, default=256,
                     help="max gram")
    cli.add_argument(
        "--threshold", dest="threshold", required=False, type=int, default=10,
        help="grams whose frequencies less than the threshold will be ignored")
    cli.add_argument(
        "--max-iter", dest="max_iter", required=False, default=10**20, type=int,
        help="max iteration when calculating the maximum probability of a password")
    args = cli.parse_args()

    strategy_value = args.strategy
    strategy = strategy_value[0]
    permits = {'guesses', 'hits', 'samples', 'auto_hits'}
    if strategy not in permits:
        print(f"strategy should be one of `{', '.join(sorted(permits))}`", file=sys.stderr)
        return
    if len(strategy_value) < 2:
        print("strategy should have at least 2 values", file=sys.stderr)
        return

    using_sample_attack, signs = False, []
    upper_bound, hits_upper_bound = 10**14, 10**14
    # One (guess-number bound, cracked-password bound) pair per round.
    func_thresholds = []
    if strategy == 'guesses':
        print("using guesses", file=sys.stderr)
        for v in [int(v) for v in strategy_value[1:]]:
            func_thresholds.append((v, hits_upper_bound))
            signs.append(f"guesses-{v:,}")
    elif strategy == 'hits':
        print("using hits", file=sys.stderr)
        for v in [int(v) for v in strategy_value[1:]]:
            func_thresholds.append((upper_bound, v))
            signs.append(f"hits-{v:,}")
    elif strategy == 'auto_hits':
        print("using auto_hits", file=sys.stderr)
        if len(strategy_value) < 4:
            # Previously an IndexError on strategy_value[2] / [3].
            print("auto_hits requires <factor> <base> <termination>", file=sys.stderr)
            return
        factor, base, termination = (int(v) for v in strategy_value[1:4])
        if factor < 2 or termination < 1:
            # log(1) == 0 would divide by zero, log(<= 0) is a domain error.
            print("auto_hits requires factor >= 2 and termination >= 1", file=sys.stderr)
            return
        end = math.ceil(math.log(termination / max(base, 1)) / math.log(factor))
        for v in range(1, end):
            nv = (factor**v) * base
            func_thresholds.append((upper_bound, nv))
            signs.append(f"auto_hits-{v:,}")
    else:
        print("using samples", file=sys.stderr)
        v = int(strategy_value[1])
        func_thresholds = [(upper_bound, hits_upper_bound) for _ in range(v)]
        signs = [f"samples-{args.size}" for _ in range(v)]
        using_sample_attack = True
    rounds = len(func_thresholds)

    splitter_map = {'empty': '', 'space': ' ', 'tab': '\t'}
    if args.splitter.lower() in splitter_map:
        args.splitter = splitter_map[args.splitter.lower()]
    start_chr, end_chr, training_list = '\x03', '\x00', [args.training.name]
    config = {
        'start_chr': start_chr,
        'end_chr': end_chr,
        'max_gram': args.max_gram,
        'threshold': args.threshold,
        'training_list': training_list
    }
    backwords, words = None, None
    training = args.training
    # makedirs also creates missing parent folders and tolerates an existing one.
    os.makedirs(args.save, exist_ok=True)
    already_cracked = set()
    print(f"We will have {rounds} rounds", file=sys.stderr, end=', ')
    cums: List[List[Tuple[str, float, int, int]]] = []
    max_guess_numbers = []
    for idx in range(rounds):
        func_threshold = func_thresholds[idx]
        print(f"The {idx}-th iteration", file=sys.stderr)
        # ``cum`` and ``already_cracked`` are handed to secondary_cracker,
        # which is presumably expected to fill them in place.
        cum = []
        backwords, words, config, training, max_gn = secondary_cracker(
            backwords, words,
            config=config,
            func_threshold=func_threshold,
            training=training,
            splitter=args.splitter,
            start4words=args.start4words,
            skip4words=args.skip4words,
            max_gram=args.max_gram,
            size=args.size,
            max_iter=args.max_iter,
            testing=args.testing,
            save=args.save,
            secondary_sample=args.secondary_sample,
            already_cracked=already_cracked,
            cum=cum,
            threshold=args.threshold,
            sign=signs[idx],
            using_sample_attack=using_sample_attack,
            tag=f"iter-{idx}",
        )
        cums.append(cum)
        max_guess_numbers.append(max_gn)
        if max_gn >= upper_bound:
            print(
                f"Too large guess number reached: {max_gn}, the training process is terminated",
                file=sys.stderr)
            break

    # Fold the training data returned by the last round into the final model.
    backwords, words = backwords_counter(training, splitter=args.splitter,
                                         start_chr=start_chr, end_chr=end_chr,
                                         start4words=args.start4words,
                                         step4words=args.skip4words,
                                         max_gram=args.max_gram,
                                         nwords_dict=backwords, words=words,
                                         threshold=args.threshold)
    f_final_model = os.path.join(args.save, "final_model.pickle")
    with open(f_final_model, 'wb') as fout_final_model:
        pickle.dump((backwords, words, config), file=fout_final_model)
    print("Training phase done.", file=sys.stderr)

    backword_mc = BackWordsSecondaryMonteCarlo((backwords, words, config),
                                               max_iter=args.max_iter)
    ml2p_list = backword_mc.sample(size=args.size)
    mc = MonteCarloLib(ml2p_list)
    scored_testing = backword_mc.parse_file(args.testing)
    gc = mc.ml2p_iter2gc(minus_log_prob_iter=scored_testing)

    # note that this is the cracked passwords obtained according to the final model
    f_iter_result = os.path.join(args.save, "iter_result.txt")
    with open(f_iter_result, 'w') as fout_iter_result:
        cum = []
        for pwd, prob, num, gn, cracked, ratio in gc:
            fout_iter_result.write(
                f"{pwd}\t{prob:.8f}\t{num}\t{gn}\t{cracked}\t{ratio:5.2f}\n")
            if pwd not in already_cracked:
                cum.append((pwd, prob, num, gn))
        cums.append(cum)

    # note that this is the union of all intermediate results
    # each guess matters in this result file
    f_sectional_result = os.path.join(args.save, "sectional_result.txt")
    with open(f_sectional_result, "w") as fout_sectional_result:
        _cracked = 0
        _total = wc_l(args.testing)
        # Guess numbers of a section are shifted by the maximum guess number
        # reported for the previous section (0 for the first one).
        for gnt, cum in zip([0, *max_guess_numbers], cums):
            for (_pwd, _prob, _n, _gn) in cum:
                _cracked += _n
                _ratio = _cracked / _total * 100
                fout_sectional_result.write(
                    f"{_pwd}\t{_prob:.8f}\t{_n}\t{_gn + gnt}\t{_cracked}\t{_ratio:5.2f}\n"
                )

    f_config = os.path.join(args.save, "config.json")
    with open(f_config, 'w') as fout_config:
        json.dump(config, fp=fout_config, indent=2)
    args.testing.close()