def __init__(self, index, lm, input, input_type, binary_interval, decode_mode=None):
    """Initialise the sentence either from text or from an interval.

    Args:
        index: token index exposing ``s2i``/``i2s`` lookups between token
            strings and integer token indices.
        lm: language model exposing ``conditional_interval`` (used when
            encoding text) and ``next`` (used when decoding an interval).
        input: the raw input — a text string when ``input_type == "text"``,
            or an arithmetic-coding interval when
            ``input_type == "interval"``.  (The name shadows the builtin
            ``input``; kept unchanged for interface compatibility.)
        input_type: either ``"text"`` or ``"interval"``.
        binary_interval: ``None``, ``"sub"`` or ``"super"``; when truthy,
            ``self.bits`` is additionally populated with the binary
            representation of ``self.interval``.
        decode_mode: required when ``input_type == "interval"``; either
            ``"deep"`` or ``"shallow"``.
    """
    assert (input_type in ("text", "interval"))
    assert (binary_interval in (None, "sub", "super"))
    self.index = index
    self.lm = lm
    if input_type == "text":
        # Encode: text -> token strings -> token indices -> interval.
        text = input
        raw_token_strings = text2token_strings(text)
        self.token_strings = normalise_and_explode_tokens(raw_token_strings)
        self.token_indices = tuple(map(self.index.s2i, self.token_strings))
        self.interval = encode(lm.conditional_interval, self.token_indices)
    elif input_type == "interval":
        # Decode: interval -> token indices -> token strings.
        assert (decode_mode in ("deep", "shallow"))
        start_interval = input
        if decode_mode == "deep":
            decoding_result = deep_decode(lm.next, start_interval,
                                          end=self.index.s2i("_END_"))
        elif decode_mode == "shallow":
            decoding_result = decode(lm.next, start_interval)
        self.token_indices = decoding_result.sequence
        self.token_strings = tuple(map(self.index.i2s, self.token_indices))
        self.interval = decoding_result.interval
    # The binary representation is computed identically for both input
    # types, so it is hoisted here once (it was duplicated verbatim inside
    # each branch in the original).  self.bits stays unset when
    # binary_interval is None, matching the original behaviour.
    if binary_interval:
        self.bits = interval2bits(self.interval, binary_interval)
help="normalise and explode tokens")
args = parser.parse_args()

# Load the index (optional: only when the user supplied --index).
if args.index:
    print_status("Started loading index from", args.index)
    with open(args.index, "r") as f:
        index = bindb.BinDBIndex(f)
    print_status("Finished loading index")

# Interactive loop: read a line of text, tokenise it, optionally
# normalise/explode the tokens, and (when an index is loaded) map each
# token string to its integer index.  Ctrl-C exits the loop.
while True:
    try:
        text = input('--> ')
    except KeyboardInterrupt:
        # Print a bare newline so the shell prompt starts on a fresh line.
        print()
        break
    token_strings = text2token_strings(text)
    if args.normalise:
        token_strings = normalise_and_explode_tokens(token_strings)
    print(" ".join(token_strings))
    if args.index:
        try:
            token_indices = tuple(map(index.s2i, token_strings))
            print(token_indices)
        except KeyError as e:
            # Out-of-vocabulary token: report it instead of crashing the loop.
            print("KeyError: {e} is not in the index".format(**locals()))
offset = 0

# Load language model
print("Loading language model...")
lm = bindb.BinDBLM(
    "/Users/kkom/Desktop/bindb-normalised/counts-consistent-tables",
    n, start, end, beta, gamma, offset)

# Load index
print("Loading words index...")
with open("/Users/kkom/Desktop/bindb-normalised/index", "r") as f:
    index = bindb.BinDBIndex(f)

# Invent plaintext: tokenise it and map the tokens to integer indices.
plaintext = "who do you miss the most?"
plaintext_strings = normalise_and_explode_tokens(text2token_strings(plaintext))
plaintext_indices = tuple(map(index.s2i, plaintext_strings))

print("Plaintext: " + plaintext, end="\n\n")
print("Plaintext token strings: " + str(plaintext_strings), end="\n\n")
print("Plaintext token indices: " + str(plaintext_indices), end="\n\n")

# Invent a password (the literal value was redacted to "******" upstream)
# and run it through the same tokenise-then-index pipeline.
password = "******"
password_strings = normalise_and_explode_tokens(text2token_strings(password))
password_indices = tuple(map(index.s2i, password_strings))

# NOTE(review): this print was syntactically broken in the original
# ('print("Password: "******"\n\n")'), apparently mangled by a
# secret-redaction pass; restored to the same shape as the plaintext
# print above.
print("Password: " + password, end="\n\n")
print("Password token strings: " + str(password_strings), end="\n\n")
print("Password token indices: " + str(password_indices), end="\n\n")
print("gamma: {gamma}".format(**locals()))
print("offset: {offset}".format(**locals()))
print()

# Load language model
lm = bindb.BinDBLM(
    "/Users/kkom/Desktop/bindb-normalised/counts-consistent-tables",
    n, start, end, beta, gamma, offset)

# Load index
with open("/Users/kkom/Desktop/bindb-normalised/index", "r") as f:
    index = bindb.BinDBIndex(f)

# Invent a sentence, tokenise it, and map the tokens to integer indices.
text = "Hey! What the f**k is going on here?"
token_strings = normalise_and_explode_tokens(text2token_strings(text))
token_indices = tuple(map(index.s2i, token_strings))

print(text)
print()
print(" ".join(token_strings))
print()
print(token_indices)
print()

# Get the next token after "is" given some intervals.
# NOTE(review): the slice presumably covers the first 9 tokens, up to and
# including "is" — confirm against the tokenisation printed above.
context = token_indices[:9]
# Probe intervals for the next-token query.  NOTE(review): the meaning of
# the three create_interval arguments is not visible here (they look like
# numerator/width/denominator of a rational interval — confirm against
# create_interval's definition).  The tuple continues past the end of this
# chunk; its closing parenthesis is outside the visible source.
intervals = (create_interval(0, 1, 1000000),
             create_interval(345246, 56, 1000000),
             create_interval(5465477, 322, 10000000),