예제 #1
0
def test():
    """Interactively score passwords typed on stdin with an NWords model.

    Builds an ``NWords2MonteCarlo`` from a hard-coded training file (second
    constructor argument ``4``), samples from it and feeds the sample to
    ``MonteCarloLib``. It then repeatedly reads a password and prints the
    value returned by ``calc_ml2p`` followed by the value returned by
    ``ml2p2rank``. The loop stops after the input ``exit`` has been scored.
    """
    training_path = "/home/cw/Documents/Experiments/SegLab/NWords/csdn-rded.txt"
    # The training handle used to be opened inline and was never closed.
    # It stays open for the whole session because it is not visible here
    # whether the model reads it eagerly or lazily.
    with open(training_path) as training_fd:
        nwmc = NWords2MonteCarlo(training_fd, 4)
        ml2p_list = nwmc.sample()
        mc = MonteCarloLib(ml2p_list)
        ipt = ""
        while ipt != "exit":
            ipt = input("Enter password: ")
            ml2p = nwmc.calc_ml2p(ipt)
            print(ml2p)
            print(mc.ml2p2rank(ml2p))
예제 #2
0
def test():
    # Interactive check for a BpePcfgSim model: draws samples from the model,
    # builds a MonteCarloLib from them, then loops forever reading passwords
    # and printing the rank returned by ml2p2rank.
    bpePcfg = BpePcfgSim(model_path="/home/cw/Documents/tmp/model")
    samples = bpePcfg.sample(size=1000000)
    monte_carlo = MonteCarloLib(minus_log_prob_list=samples)
    while True:
        # NOTE(review): the next line is corrupted (the text between the
        # input prompt and the f-string was replaced by "******"), so this
        # function does not parse as written. It presumably was three
        # statements: read `pwd`, compute `prob` for it, print both.
        # The call that produced `prob` is not recoverable from this file;
        # restore it from the original source before using this function.
        pwd = input("type in a password: "******"pwd: {pwd}, prob: {2 ** (-prob)}", end=", ")
        rank = monte_carlo.ml2p2rank(prob)
        print(f"rank: {rank}")
    pass
예제 #3
0
def wrapper():
    """Command-line entry point of the N-words Monte Carlo simulator.

    Parses the options, builds an ``NWordsMonteCarlo`` from the input file,
    samples ``--size`` values from it, scores the testing file with
    ``parse_file``, passes the scores to ``MonteCarloLib.ml2p_iter2gc`` and
    finally calls ``write2`` on the ``--save`` file.
    """
    splitter_help = ("how to divide different columns from the input file. "
                     "Set it \"empty\" to represent \'\'")
    start_help = ("start index for words, to fit as much as formats of input. An entry per line. "
                  "We get an array of words by splitting the entry. "
                  "\"start4word\" is the index of the first word in the array")
    skip_help = ("there may be other elements between words, such as tags. "
                 "Set skip4word larger than 1 to skip unwanted elements.")

    parser = argparse.ArgumentParser("N words simulator")
    parser.add_argument("-i", "--input", dest="input", required=True,
                        type=argparse.FileType('r'), help="nwords file")
    parser.add_argument("-t", "--test", dest="test", required=True,
                        type=argparse.FileType('r'), help="testing file")
    parser.add_argument("-s", "--save", dest="save", required=True,
                        type=argparse.FileType('w'),
                        help="save Monte Carlo results here")
    parser.add_argument("-n", "--ngram", dest="ngram", type=int, default=2,
                        choices=[2, 3, 4, 5, 6], help="ngram")
    parser.add_argument("--size", dest="size", type=int, default=100000,
                        help="sample size")
    # The converter collapses a doubled backslash into a single one.
    parser.add_argument("--splitter", dest="splitter", default="\t",
                        type=lambda raw: str(raw).replace("\\\\", "\\"),
                        help=splitter_help)
    parser.add_argument("--start4word", dest="start4word", type=int,
                        default=0, help=start_help)
    parser.add_argument("--skip4word", dest="skip4word", type=int,
                        default=1, help=skip_help)
    options = parser.parse_args()

    # The literal word "empty" stands for the empty-string splitter.
    if options.splitter == 'empty':
        options.splitter = ''

    simulator = NWordsMonteCarlo(
        options.input,
        splitter=options.splitter,
        n=options.ngram,
        start4word=options.start4word,
        skip4word=options.skip4word,
    )
    sampled = simulator.sample(size=options.size)
    monte_carlo = MonteCarloLib(sampled)
    scored = simulator.parse_file(options.test)
    monte_carlo.ml2p_iter2gc(minus_log_prob_iter=scored)
    monte_carlo.write2(options.save)
예제 #4
0
def main():
    """Command-line entry point for "NWords v2".

    Builds an ``NWords2MonteCarlo`` from the training set (second constructor
    argument ``4``), samples from it, scores the testing set with
    ``parse_file``, passes the scores to ``MonteCarloLib.ml2p_iter2gc`` and
    calls ``write2`` on the ``--save`` target (stdout when not given).
    """
    parser = argparse.ArgumentParser("NWords v2")
    parser.add_argument("-f", "--file", dest="training", required=True,
                        type=argparse.FileType("r"), help="training set")
    parser.add_argument("-t", "--target", dest="testing", required=True,
                        type=argparse.FileType("r"), help="testing set")
    parser.add_argument("-s", "--save", dest="save", default=sys.stdout,
                        type=argparse.FileType("w"), help="save results")
    options = parser.parse_args()

    model = NWords2MonteCarlo(options.training, 4)
    monte_carlo = MonteCarloLib(model.sample())
    scored = model.parse_file(options.testing)
    monte_carlo.ml2p_iter2gc(minus_log_prob_iter=scored)
    monte_carlo.write2(options.save)
예제 #5
0
def wrapper():
    """Command-line entry point that evaluates an already trained model.

    The ``--model`` file is opened in binary mode and handed to
    ``BackWordsSecondaryMonteCarlo``. With ``--debug-mode`` the function
    reads passwords from stdin and prints the first element returned by
    ``calc_ml2p`` until ``exit`` has been entered (and scored). Otherwise it
    samples from the model, scores the testing file, passes the scores to
    ``MonteCarloLib.ml2p_iter2gc`` and calls ``write2`` on the save file.
    """
    parser = argparse.ArgumentParser("Backoff words simulator")
    parser.add_argument("-m", "--model", dest="model", required=True,
                        type=argparse.FileType('rb'), help="trained model")
    parser.add_argument("-t", "--test", dest="test", required=True,
                        type=argparse.FileType('r'), help="testing file")
    parser.add_argument("-s", "--save", dest="save", required=True,
                        type=argparse.FileType('w'),
                        help="save Monte Carlo results here")
    parser.add_argument("--size", dest="size", type=int, default=100000,
                        help="sample size")
    parser.add_argument(
        "--debug-mode", dest="debug_mode", action="store_true",
        help="enter passwords and show probability of the password")
    parser.add_argument(
        "--max-iter", dest="max_iter", type=int, default=10**20,
        help="max iteration when calculating the maximum probability of a password")
    options = parser.parse_args()

    model = BackWordsSecondaryMonteCarlo(options.model,
                                         max_iter=options.max_iter)

    if options.debug_mode:
        # Interactive mode: the sentinel "exit" is itself scored before the
        # loop terminates.
        while True:
            typed = input("type in passwords: ")
            prob, _components = model.calc_ml2p(typed)
            print(prob)
            if typed == "exit":
                return

    sampled = model.sample(size=options.size)
    monte_carlo = MonteCarloLib(sampled)
    scored = model.parse_file(options.test)
    monte_carlo.ml2p_iter2gc(minus_log_prob_iter=scored)
    monte_carlo.write2(options.save)
예제 #6
0
def wrapper(model_path: str,
            testing_set: TextIO,
            save2: TextIO,
            size: int = 1000000):
    """Score a testing set with a BpePcfgSim model and save the result.

    Args:
        model_path: passed to ``BpePcfgSim`` as ``model_path``
            (e.g. "/home/cw/Documents/tmp/model").
        testing_set: open text file handed to ``parse_file``
            (e.g. open("/home/cw/Documents/tmp/178_new.txt")).
        save2: open text file handed to ``MonteCarloLib.write2``
            (e.g. open("/home/cw/Documents/tmp/scored_178.txt", "w")).
        size: number of samples requested from the model.
    """
    simulator = BpePcfgSim(model_path=model_path)
    sampled = simulator.sample(size=size)
    scored_testing = simulator.parse_file(testing_set)
    estimator = MonteCarloLib(minus_log_prob_list=sampled)
    estimator.ml2p_iter2gc(scored_testing, need_resort=True, add1=True)
    estimator.write2(save2)
예제 #7
0
def wrapper():
    """Command-line entry point of the backoff-words Monte Carlo simulator.

    Builds a ``BackWordsMonteCarlo`` from the input file and the parsed
    options. With ``--debug-mode`` it reads passwords from stdin and prints
    the first element returned by ``calc_ml2p`` until ``exit`` has been
    entered (and scored). Otherwise it samples from the model, scores the
    testing file, passes the scores to ``MonteCarloLib.ml2p_iter2gc`` and
    calls ``write2`` on the save file.
    """
    splitter_help = ("how to divide different columns from the input file, "
                     "set it \"empty\" to represent \'\', \"space\" for \' \', \"tab\" for \'\t\'")
    start_help = ("start index for words, to fit as much as formats of input. An entry per line. "
                  "We get an array of words by splitting the entry. "
                  "\"start4word\" is the index of the first word in the array")
    skip_help = ("there may be other elements between words, such as tags. "
                 "Set skip4word larger than 1 to skip unwanted elements.")

    parser = argparse.ArgumentParser("Backoff words simulator")
    parser.add_argument("-i", "--input", dest="input", required=True,
                        type=argparse.FileType('r'), help="nwords file")
    parser.add_argument("-t", "--test", dest="test", required=True,
                        type=argparse.FileType('r'), help="testing file")
    parser.add_argument("-s", "--save", dest="save", required=True,
                        type=argparse.FileType('w'),
                        help="save Monte Carlo results here")
    parser.add_argument("--size", dest="size", type=int, default=100000,
                        help="sample size")
    parser.add_argument("--splitter", dest="splitter", type=str,
                        default="empty", help=splitter_help)
    parser.add_argument("--start4word", dest="start4word", type=int,
                        default=0, help=start_help)
    parser.add_argument("--skip4word", dest="skip4word", type=int,
                        default=1, help=skip_help)
    parser.add_argument("--threshold", dest="threshold", type=int, default=10,
                        help="grams whose frequencies less than the threshold will be ignored")
    parser.add_argument("--debug-mode", dest="debug_mode", action="store_true",
                        help="enter passwords and show probability of the password")
    parser.add_argument("--max-gram", dest="max_gram", type=int, default=256,
                        help="max gram")
    parser.add_argument("--max-iter", dest="max_iter", type=int,
                        default=10 ** 20,
                        help="max iteration when calculating the maximum probability of a password")
    options = parser.parse_args()

    # Symbolic names (case-insensitive) map to the real separator; any other
    # value is used verbatim.
    named_splitters = {'empty': '', 'space': ' ', 'tab': '\t'}
    options.splitter = named_splitters.get(options.splitter.lower(),
                                           options.splitter)

    model = BackWordsMonteCarlo(
        options.input,
        splitter=options.splitter,
        start4word=options.start4word,
        skip4word=options.skip4word,
        threshold=options.threshold,
        max_gram=options.max_gram,
        max_iter=options.max_iter,
    )

    if options.debug_mode:
        # Interactive mode: the sentinel "exit" is itself scored before the
        # loop terminates.
        while True:
            typed = input("type in passwords: ")
            prob, _components = model.calc_ml2p(typed)
            print(prob)
            if typed == "exit":
                return

    sampled = model.sample(size=options.size)
    monte_carlo = MonteCarloLib(sampled)
    scored = model.parse_file(options.test)
    monte_carlo.ml2p_iter2gc(minus_log_prob_iter=scored)
    monte_carlo.write2(options.save)
예제 #8
0
def secondary_cracker(backwords, words, config,
                      func_threshold: Tuple[int, int], **kwargs):
    """Run one round of secondary training followed by cracking.

    Steps performed:

    1. Update the model through ``backwords_counter`` using
       ``kwargs['training']`` on top of ``backwords`` / ``words``.
    2. Pickle the updated model to ``model-to-crack-<tag>.pickle``.
    3. Sample from the model and score ``kwargs['testing']``.
    4. Select the passwords accepted in this round, write them to
       ``cracked-<tag>.txt`` and collect them as the next training set.

    Args:
        backwords: passed to ``backwords_counter`` as ``nwords_dict``.
        words: passed to ``backwords_counter`` as ``words``.
        config: dict providing ``start_chr`` and ``end_chr``; its
            ``training_list`` is extended in place with ``kwargs['sign']``.
        func_threshold: ``(gn_upper_bound, hits_upper_bound)``. Without the
            sample attack a password is accepted only while its guess number
            is below the first bound and fewer than the second bound of
            entries have been accepted.
        **kwargs: required keys are ``save``, ``tag``, ``training``,
            ``splitter``, ``start4words``, ``skip4words``, ``max_gram``,
            ``threshold``, ``sign``, ``max_iter``, ``using_sample_attack``,
            ``size``, ``testing``, ``already_cracked`` (set, extended in
            place), ``cum`` (list, extended in place) and
            ``secondary_sample``.

    Returns:
        ``(nwords_dict, words, config, secondary_training, max_gn)`` where
        the first two are the values returned by ``backwords_counter``,
        ``secondary_training`` is the list of accepted passwords (each
        repeated ``num`` times, down-sampled to ``secondary_sample`` if
        longer) and ``max_gn`` is the largest accepted guess number.
    """
    save_in_folder = kwargs['save']
    tag = kwargs['tag']
    splitter = kwargs['splitter']
    nwords_dict, _words = backwords_counter(nwords_list=kwargs['training'],
                                            splitter=splitter,
                                            start_chr=config['start_chr'],
                                            end_chr=config['end_chr'],
                                            start4words=kwargs['start4words'],
                                            step4words=kwargs['skip4words'],
                                            max_gram=kwargs['max_gram'],
                                            nwords_dict=backwords,
                                            words=words,
                                            threshold=kwargs['threshold'])
    fmodel = os.path.join(save_in_folder, f"model-to-crack-{tag}.pickle")
    with open(fmodel, 'wb') as fd:
        sign = kwargs['sign']
        config['training_list'].append(f"{sign}")
        # Persist the words returned by the counter. The incoming `words`
        # is None on the first round, so dumping it would store a model that
        # differs from the one used below.
        pickle.dump((nwords_dict, _words, config), file=fd)
    backword_mc = BackWordsSecondaryMonteCarlo((nwords_dict, _words, config),
                                               max_iter=kwargs['max_iter'])
    # Note: this part is to "generate" some guesses and crack passwords in
    # the testing dataset.
    #
    # Besides, the sampled passwords themselves can be collected. The
    # intersection of the sampled passwords and the testing dataset is then
    # treated as the set of cracked passwords.
    using_sample_attack = kwargs['using_sample_attack']
    sampled_pwds = {} if using_sample_attack else None
    ml2p_list = backword_mc.sample(size=kwargs['size'],
                                   sampled_pwds=sampled_pwds)
    if using_sample_attack:
        f_samples = os.path.join(save_in_folder, f"samples-{tag}.txt")
        with open(f_samples, 'w') as fout_samples:
            sidx = 1
            # sorted() materialises the items first, so rebinding the values
            # inside the loop is safe.
            for pwd, (prob, cnt) in sorted(sampled_pwds.items(),
                                           key=lambda x: x[1][0]):
                fout_samples.write(f"{pwd}\t{prob:.8f}\t{cnt}\n")
                # Replace (prob, cnt) with a running index: 1 plus the total
                # count of all entries written before this one.
                sampled_pwds[pwd] = sidx
                sidx += cnt
    mc = MonteCarloLib(ml2p_list)
    scored_testing = backword_mc.parse_file(kwargs['testing'],
                                            using_component=True)
    gc = mc.ml2p_iter2gc(minus_log_prob_iter=scored_testing)
    secondary_training = []
    fcracked = os.path.join(save_in_folder, f"cracked-{tag}.txt")
    already_cracked = kwargs['already_cracked']
    cum: List[Tuple[str, float, int, int]] = kwargs['cum']
    with open(fcracked, 'w') as fout:
        unique, max_gn = 0, 0
        gn_upper_bound, hits_upper_bound = func_threshold
        for pwd, prob, num, gn, cracked, ratio in gc:
            # `pwd` is a sequence of components here; rebuild the string.
            _pwd = splitter.join(pwd)
            if _pwd in already_cracked:
                continue
            valid1 = (using_sample_attack and _pwd in sampled_pwds)
            if valid1:
                # In sample-attack mode the index assigned above is used as
                # the guess number.
                gn = sampled_pwds[_pwd]
            if valid1 or (not using_sample_attack and
                          (gn < gn_upper_bound and unique < hits_upper_bound)):
                unique += 1
                max_gn = max(max_gn, gn)
                secondary_training.extend([_pwd] * num)
                cum.append((_pwd, prob, num, gn))
                fout.write(f"{_pwd}\t{prob:.8f}\t{num}\t{gn}\n")
                already_cracked.add(_pwd)
    secondary_sample_size = kwargs['secondary_sample']
    if secondary_sample_size < len(secondary_training):
        print(
            f"We sample {secondary_sample_size} passwords to perform secondary training in the next round",
            file=sys.stderr)
        fsample = os.path.join(save_in_folder, f"sampled-{tag}.txt")
        secondary_training = random.sample(secondary_training,
                                           secondary_sample_size)
        with open(fsample, 'w') as fout:
            for pwd in secondary_training:
                fout.write(f"{pwd}\n")
    return nwords_dict, _words, config, secondary_training, max_gn
예제 #9
0
def wrapper():
    """Command-line entry point for iterative ("secondary") training.

    The ``--strategy`` option defines a list of rounds. Each round calls
    ``secondary_cracker``, which retrains the model on the passwords accepted
    in the previous round and reports the newly accepted ones. After the last
    round the function:

    * trains the final model and pickles it to ``final_model.pickle``;
    * scores the testing set with it and writes ``iter_result.txt``;
    * writes ``sectional_result.txt`` from the per-round results plus the
      passwords of the final pass that were not accepted earlier;
    * dumps the configuration to ``config.json``.

    All files are created inside the ``--save`` folder. Invalid strategy
    arguments are reported on stderr and the function returns without doing
    any work.
    """
    cli = argparse.ArgumentParser('Backwords secondary main')
    cli.add_argument("-i",
                     "--training",
                     dest="training",
                     type=argparse.FileType('r'),
                     required=True,
                     help="The training file, each password a line")
    cli.add_argument("-t",
                     "--testing",
                     dest="testing",
                     type=argparse.FileType('r'),
                     required=True,
                     help="The testing file, each password a line")
    cli.add_argument("-s",
                     "--save",
                     dest="save",
                     required=True,
                     type=str,
                     help='A folder, results will be saved in this folder')
    cli.add_argument(
        "--strategy",
        dest="strategy",
        required=True,
        type=str,
        nargs="+",
        # choices=['guesses', 'hits', 'samples'],
        # The fragments used to be glued together without separators
        # ("...base>`samples", "ofMonte Carlo").
        help=
        '`guesses <guesses1> <guesses2> ...` means guess number thresholds, '
        '`hits <cracked1> <cracked2>` means cracked passwords, '
        '`auto_hits <factor> <base> <termination>` means auto generate '
        '<cracked1 = factor * base> <cracked2 = factor ** 2 * base> ..., '
        '`samples <rounds>` means the number of iterations of '
        'Monte Carlo simulation')
    cli.add_argument("--size",
                     dest="size",
                     type=int,
                     required=False,
                     default=100000,
                     help="sample size")
    cli.add_argument(
        "--secondary-sample",
        dest="secondary_sample",
        type=int,
        required=False,
        default=10000000000,
        help="use some of the cracked passwords for secondary training.")
    cli.add_argument(
        "--splitter",
        dest="splitter",
        type=str,
        required=False,
        default="empty",
        # "\\t" so the help shows the two characters \t, not a real TAB.
        help="how to divide different columns from the input file, "
        "set it \"empty\" to represent \'\', \"space\" for \' \', \"tab\" for \'\\t\'"
    )
    cli.add_argument(
        "--start4word",
        dest="start4words",
        type=int,
        required=False,
        default=0,
        help=
        "start index for words, to fit as much as formats of input. An entry per line. "
        "We get an array of words by splitting the entry. "
        "\"start4word\" is the index of the first word in the array")
    cli.add_argument(
        "--skip4word",
        dest="skip4words",
        type=int,
        required=False,
        default=1,
        help="there may be other elements between words, such as tags. "
        "Set skip4word larger than 1 to skip unwanted elements.")
    cli.add_argument("--max-gram",
                     dest="max_gram",
                     required=False,
                     type=int,
                     default=256,
                     help="max gram")
    cli.add_argument(
        "--threshold",
        dest="threshold",
        required=False,
        type=int,
        default=10,
        help="grams whose frequencies less than the threshold will be ignored")
    cli.add_argument(
        "--max-iter",
        dest="max_iter",
        required=False,
        default=10**20,
        type=int,
        help=
        "max iteration when calculating the maximum probability of a password")
    args = cli.parse_args()
    strategy_value = args.strategy
    strategy = strategy_value[0]
    permits = {'guesses', 'hits', 'samples', 'auto_hits'}
    if strategy not in permits:
        print(f"strategy should be one of `{', '.join(permits)}`",
              file=sys.stderr)
        return
    if len(strategy_value) < 2:
        print("strategy should have at least 2 values", file=sys.stderr)
        return

    # Each round is described by (guess-number bound, hits bound) and a
    # textual sign that secondary_cracker appends to config['training_list'].
    using_sample_attack, signs = False, []
    upper_bound, hits_upper_bound = 10**14, 10**14
    func_thresholds = []
    if strategy == 'guesses':
        print("using guesses", file=sys.stderr)
        for v in (int(raw) for raw in strategy_value[1:]):
            func_thresholds.append((v, hits_upper_bound))
            signs.append(f"guesses-{v:,}")
    elif strategy == 'hits':
        print("using hits", file=sys.stderr)
        for v in (int(raw) for raw in strategy_value[1:]):
            func_thresholds.append((upper_bound, v))
            signs.append(f"hits-{v:,}")
    elif strategy == 'auto_hits':
        print("using auto_hits", file=sys.stderr)
        # Previously a short argument list raised IndexError.
        if len(strategy_value) < 4:
            print("auto_hits needs <factor> <base> <termination>",
                  file=sys.stderr)
            return
        factor, base, termination = (int(raw) for raw in strategy_value[1:4])
        # factor <= 1 made log(factor) zero (ZeroDivisionError) and a
        # non-positive termination is outside the domain of log.
        if factor < 2 or termination < 1:
            print("auto_hits needs factor >= 2 and termination >= 1",
                  file=sys.stderr)
            return
        end = math.ceil(
            math.log(termination / max(base, 1)) / math.log(factor))
        for v in range(1, end):
            nv = (factor**v) * base

            func_thresholds.append((upper_bound, nv))
            signs.append(f"auto_hits-{v:,}")
    else:
        print("using samples", file=sys.stderr)
        v = int(strategy_value[1])
        func_thresholds = [(upper_bound, hits_upper_bound) for _ in range(v)]
        signs = [f"samples-{args.size}" for _ in range(v)]
        using_sample_attack = True
    rounds = len(func_thresholds)
    # Symbolic splitter names are case-insensitive; anything else is used
    # verbatim.
    splitter_map = {'empty': '', 'space': ' ', 'tab': '\t'}
    if args.splitter.lower() in splitter_map:
        args.splitter = splitter_map[args.splitter.lower()]
    start_chr, end_chr, training_list = '\x03', '\x00', [args.training.name]
    config = {
        'start_chr': start_chr,
        'end_chr': end_chr,
        'max_gram': args.max_gram,
        'threshold': args.threshold,
        'training_list': training_list
    }
    backwords, words = None, None
    training = args.training
    # makedirs also handles nested save paths and an already existing folder.
    os.makedirs(args.save, exist_ok=True)
    already_cracked = set()

    print(f"We will have {rounds} rounds", file=sys.stderr, end=', ')
    cums: List[List[Tuple[str, float, int, int]]] = []
    max_guess_numbers = []
    for idx in range(rounds):
        func_threshold = func_thresholds[idx]
        print(f"The {idx}-th iteration", file=sys.stderr)
        cum = []
        # `training` is the original file for the first round and the list of
        # passwords accepted in the previous round afterwards.
        backwords, words, config, training, max_gn = secondary_cracker(
            backwords,
            words,
            config=config,
            func_threshold=func_threshold,
            training=training,
            splitter=args.splitter,
            start4words=args.start4words,
            skip4words=args.skip4words,
            max_gram=args.max_gram,
            size=args.size,
            max_iter=args.max_iter,
            testing=args.testing,
            save=args.save,
            secondary_sample=args.secondary_sample,
            already_cracked=already_cracked,
            cum=cum,
            threshold=args.threshold,
            sign=signs[idx],
            using_sample_attack=using_sample_attack,
            tag=f"iter-{idx}",
        )
        cums.append(cum)
        max_guess_numbers.append(max_gn)
        if max_gn >= upper_bound:
            print(
                f"Too large guess number reached: {max_gn}, the training process is terminated",
                file=sys.stderr)
            break
    # Fold the passwords accepted in the last round into the final model.
    backwords, words = backwords_counter(training,
                                         splitter=args.splitter,
                                         start_chr=start_chr,
                                         end_chr=end_chr,
                                         start4words=args.start4words,
                                         step4words=args.skip4words,
                                         max_gram=args.max_gram,
                                         nwords_dict=backwords,
                                         words=words,
                                         threshold=args.threshold)
    f_final_model = os.path.join(args.save, "final_model.pickle")
    with open(f_final_model, 'wb') as fout_final_model:
        pickle.dump((backwords, words, config), file=fout_final_model)
    print("Training phase done.", file=sys.stderr)
    backword_mc = BackWordsSecondaryMonteCarlo((backwords, words, config),
                                               max_iter=args.max_iter)
    ml2p_list = backword_mc.sample(size=args.size)
    mc = MonteCarloLib(ml2p_list)
    scored_testing = backword_mc.parse_file(args.testing)
    gc = mc.ml2p_iter2gc(minus_log_prob_iter=scored_testing)
    # note that this is the cracked passwords obtained according to the final model
    f_iter_result = os.path.join(args.save, "iter_result.txt")
    with open(f_iter_result, 'w') as fout_iter_result:
        cum = []
        for pwd, prob, num, gn, cracked, ratio in gc:
            fout_iter_result.write(
                f"{pwd}\t{prob:.8f}\t{num}\t{gn}\t{cracked}\t{ratio:5.2f}\n")
            # Only passwords not accepted in an earlier round belong to the
            # final section.
            if pwd not in already_cracked:
                cum.append((pwd, prob, num, gn))
        cums.append(cum)
    # note that this is the union of all intermediate results
    # each guess matters in this result file
    f_sectional_result = os.path.join(args.save, "sectional_result.txt")
    with open(f_sectional_result, "w") as fout_sectional_result:
        _cracked = 0
        _total = wc_l(args.testing)
        # Guess numbers of a section are shifted by the largest guess number
        # of the round before it (0 for the first section).
        # NOTE(review): the shift is not accumulated over all earlier
        # rounds; confirm that this is intended.
        for gnt, cum in zip([0, *max_guess_numbers], cums):
            for (_pwd, _prob, _n, _gn) in cum:
                _cracked += _n
                _ratio = _cracked / _total * 100
                fout_sectional_result.write(
                    f"{_pwd}\t{_prob:.8f}\t{_n}\t{_gn + gnt}\t{_cracked}\t{_ratio:5.2f}\n"
                )
    f_config = os.path.join(args.save, "config.json")
    with open(f_config, 'w') as fout_config:
        json.dump(config, fp=fout_config, indent=2)
    args.testing.close()