import csv
import os
import sys
from concurrent.futures.thread import ThreadPoolExecutor
from os import listdir
from os.path import isfile, join
from time import time

from word2keypress import Keyboard

kb = Keyboard()


def is_valid_password(password):
    r"""Return True when ``password`` passes the dataset filter.

    A password is accepted only if all of the following hold:

    - its length is between 4 and 30 characters, inclusive (passwords
      shorter than 4 or longer than 30 characters are removed);
    - it consists solely of printable ASCII characters;
    - it is not a hex-encoded password, i.e. it does not start with a
      literal ``\x`` and does not contain ``$HEX`` (as in ``$HEX[...]``);
    - it contains no HTML markup or entities, i.e. none of ``<``, ``>``
      or ``&``.

    Bot removal (the same mail used more than 100 times) is part of the
    overall filtering but is not performed here: this function only
    inspects the password string itself.

    :param password: the candidate password (a ``str``).
    :return: ``True`` if the password should be kept, ``False`` otherwise.
    """
    # Inclusive bounds: only passwords *shorter than 4* or *longer than 30*
    # characters are meant to be dropped.
    if not 4 <= len(password) <= 30:
        return False
    if not (password.isascii() and password.isprintable()):
        return False
    # A leading literal backslash-x marks an escaped/hex-encoded password.
    if password.startswith('\\x'):
        return False
    # '&' alone also rejects the HTML entities '&le', '&ge' and '&#...'.
    return not any(marker in password for marker in ('$HEX', '<', '>', '&'))


# The function filter_file filters source, removing mail in this way:
# - mail which appear more than 100 times and less than 2.
# - mail with non-valid password
from pathlib import Path
import pandas as pd
import numpy as np
from word2keypress import Keyboard
import pdb
import gensim
import numpy as np
from numpy import dot
from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format, _compute_ngrams, _ft_hash
from numpy import dot
import numpy as np
import math
from gensim import utils, matutils

# Module-level keyboard helper from word2keypress.
KB = Keyboard()
# The model is loaded at import time from a hard-coded absolute path.
# NOTE(review): the file name and the n-gram attributes used below suggest a
# FastText-style model, although it is loaded through Word2Vec.load -- confirm.
model = gensim.models.Word2Vec.load('/hdd/c3s/models/fastText2_keyseq_mincount:10_ngram:1-4_negsamp:5_subsamp:0.001_d:100')
# NOTE(review): presumably called so that the normalised arrays
# (e.g. vectors_ngrams_norm, read below) are populated -- confirm against the
# installed gensim version.
model.init_sims()


def get_vector_ngram(word):
    """Accumulate the normalised n-gram vectors known to ``model`` for ``word``.

    The visible code starts from a zero float32 vector whose width is the
    second dimension of ``model.wv.vectors_ngrams``, splits ``word`` into
    n-grams of length ``model.wv.min_n`` to ``model.wv.max_n``, and adds the
    normalised vector of every n-gram whose hash bucket is present in
    ``model.wv.hash2index``, counting how many were found.

    NOTE(review): the visible source ends right after the loop, so no return
    statement (or use of ``ngrams_found``) is shown here -- the remainder of
    the function is outside this view.
    """
    word_vec = np.zeros(model.wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngrams = _compute_ngrams(word, model.wv.min_n, model.wv.max_n)
    ngrams_found = 0
    for ngram in ngrams:
        # Map the n-gram to its hash bucket.
        ngram_hash = _ft_hash(ngram) % model.wv.bucket
        # Buckets missing from hash2index are skipped.
        if ngram_hash in model.wv.hash2index:
            word_vec += model.wv.vectors_ngrams_norm[model.wv.hash2index[ngram_hash]]
            ngrams_found += 1
import numpy as np

# NOTE(review): `string`, `re`, `Keyboard` and `ALLOWED_KEYS` are used below
# but are not defined in the visible part of the file -- confirm they are
# imported/defined above this point.

# Every character accepted in a password: ASCII letters, digits, punctuation
# and the space character.
ALLOWED_CHARACTERS = string.ascii_letters + string.digits + string.punctuation + ' ' # removed tab

# NOTE(review): `\;` in the literals below is not a recognised escape
# sequence; Python keeps the backslash (so the value is a backslash followed
# by ';'), but newer interpreters warn about it. Consider writing `\\;`.

# Unshifted key -> shifted key, built from bytes literals (under Python 3,
# iterating bytes yields integer code points, so keys and values are ints).
NOTSHIFT_2_SHIFT_MAP = dict(
    zip(b'`1234567890-=[]\;\',./', b'~!@#$%^&*()_+{}|:"<>?'))
# Inverse of the map above: shifted key -> unshifted key.
SHIFT_2_NOTSHIFT_MAP = dict(
    zip(b'~!@#$%^&*()_+{}|:"<>?', b'`1234567890-=[]\;\',./'))
# Both directions in a single str -> str map: each symbol/digit key maps to
# its shift counterpart.
SHIFT_SWITCH_MAP = dict(
    zip('`1234567890-=[]\;\',./~!@#$%^&*()_+{}|:"<>?', '~!@#$%^&*()_+{}|:"<>?`1234567890-=[]\;\',./'))

# Matches a trailing run (one or more) of characters that are keys of
# SHIFT_SWITCH_MAP and captures that run as the named group `last`.
SYMDIGIT_re = re.compile(r'(?P<last>(%s)+)$' % '|'.join(map(re.escape, SHIFT_SWITCH_MAP.keys())))

# NOTE(review): built from ALLOWED_KEYS, not ALLOWED_CHARACTERS -- confirm
# this is intentional and that ALLOWED_KEYS exists.
allowed_keys = np.array(list(ALLOWED_KEYS))
KB = Keyboard('US')

# The module-level string below continues past the end of the visible source,
# so it is reproduced unchanged and left open here.
"""This is the set of correctors we consider. A corrector is a function which tries to fix a typo in a password but applying some generic modifications. By generic I meant the modification is not dependent on the specific password it is modifying. Every corrector comes with an inverse function which defines the mistyped password or passwords that this corrector can fix. e.g., if "adding a 1 to the end" be a corrector, then removing the last one will be the typo version of that corrector. In the EDITS_NAME_FUNC_MAP, the first entry is the corrector function while the second entry is the typo (inverse of the corrector) of that function.