def create_vocabulary(ngram=1, test=False):
    """Create and persist the vocabulary trie for one ngram level.

    :param ngram: ngram size to build the vocabulary for.
    :param test: if True, only runs through the first 10000 documents.
    :return: None; the trie is saved under PATH_TOKENIZED/tries/.

    Steps:
    - Get a set of all tokens
    - Retain only the valid ones
    """
    add_valid_words()
    # Collect every token observed in the documents for this ngram level.
    token_set = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(token_set))
    # Keep only the valid ngrams, then compress them into a trie.
    valid_iterator = valid_ngram_iterator(token_set, ngram)
    vocabulary_trie = Trie(valid_iterator)
    vocabulary_trie.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
    print("Total tokens after merging", len(vocabulary_trie))
def loadTrie(fname):
    """Load the cached trie from ``<fname>_trie.hny``; on a miss, rebuild it.

    On IOError (cache file absent/unreadable), reads a bz2-compressed word
    list from ``dir_path + sys.argv[1]``, builds a fresh Trie from it, and
    saves it under the cache filename for subsequent runs.

    :param fname: base name of the trie cache file (suffix is appended).
    """
    global trie
    # String concatenation cannot raise IOError, so it is safe outside the try.
    fname = fname + "_trie.hny"
    try:
        trie.load(fname)
    except IOError:
        # Cache miss: rebuild from the compressed word list.  Use a context
        # manager so the bz2 handle is closed (the original leaked it).
        with bz2.BZ2File(dir_path + sys.argv[1]) as f:
            words = [w.strip() for w in f.readlines()]
        trie = Trie(words)
        trie.save(fname)
def add_terms():
    """Extend the 1- and 2-gram vocabulary tries with ADDED_TOKENS.

    For each ngram level, rebuilds the trie from the union of its current
    keys and the hand-curated additions, overwrites the saved trie file,
    and re-tokenizes the full database so the new terms are picked up.

    NOTE: this invalidates the previously assigned token ids, which is
    acceptable because the doc-term matrices are no longer used.
    """
    for ngram in (1, 2):
        started_at = time.time()
        # Merge the existing vocabulary keys with the curated additions.
        current = load_vocabulary_trie(ngram)
        merged_keys = current.keys() + ADDED_TOKENS[ngram]
        rebuilt = Trie(merged_keys)
        rebuilt.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
        # Re-run tokenization over the whole database with the new terms.
        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - started_at))
def load_password_blacklist():
    """Load (or compile and cache) the global password blacklist trie.

    Behavior:
    - 'NOBLACKLIST' configured: install an empty trie and return.
    - Pre-compiled cache file exists: load it.
    - Otherwise: read the configured plaintext list, build the trie, and
      save the compiled form for future runs.

    Raises SystemExit(-1) if the configured blacklist file is missing.
    """
    global password_blackList
    if conf.password_blackList == 'NOBLACKLIST':
        LOGGER.warning('No password blacklist file defined.')
        password_blackList = Trie()
        return
    if os.path.isfile('compiledPwdBlacklist.bin'):
        LOGGER.info('Loading pre-compiled password blacklist...')
        password_blackList = Trie()
        password_blackList.load('compiledPwdBlacklist.bin')
        return
    try:
        LOGGER.info('Compiling password blacklist...')
        with open(conf.password_blackList, encoding="utf-8") as f:
            pwds = f.read().splitlines()
        password_blackList = Trie(pwds)
        password_blackList.save('compiledPwdBlacklist.bin')
    except FileNotFoundError:
        LOGGER.error('File ' + conf.password_blackList + ' not found. Aborting.')
        # Raise SystemExit directly: the builtin exit() is injected by the
        # site module and is not guaranteed to exist in every runtime.
        raise SystemExit(-1)
def craft_index(wordlist: List[str], output_dir: Path) -> Path:
    """Generate the special file "words" that is an index of all words.

    :param wordlist: words to put into the index.
    :param output_dir: directory where the index file is written.
    :return: path of the generated "words" file.
    """
    destination = output_dir / "words"
    Trie(wordlist).save(destination)
    return destination
# coding: utf-8
"""Read patterns from stdin, normalize them, and save them as a marisa trie."""
import sys

from marisa_trie import Trie


def _normalize_patterns(lines):
    """Normalize raw lines into patterns: strip, '_' -> ' ', lowercase.

    Blank lines are skipped.  (The original tested ``len(line)``, which is
    never 0 for lines from ``readline`` — they keep their newline — so
    empty patterns could slip into the trie.)
    """
    patterns = []
    for line in lines:
        ptn = line.strip().replace('_', ' ').lower()
        if len(ptn) == 0:
            continue
        patterns.append(ptn)
    return patterns


patterns = _normalize_patterns(iter(sys.stdin.readline, ""))
trie = Trie(patterns)
trie.save('triedict')