Пример #1
0
import csv
import os
import sys
from concurrent.futures.thread import ThreadPoolExecutor
from os import listdir
from os.path import isfile, join
from time import time
from word2keypress import Keyboard

# Module-level Keyboard instance from word2keypress, built with its default arguments.
kb = Keyboard()

# The function is_valid_password filters passwords as follows:
# - removes passwords longer than 30 characters or shorter than 4.
# - removes passwords containing non-ASCII or non-printable characters.
# - removes bots, which are recognisable by the same mail being used more than 100 times.
# - removes HEX passwords (identified by $HEX[]) and passwords starting with \x.
# - removes passwords containing HTML character entities (e.g. &lt, &gt, &amp, &#).


def is_valid_password(password):
    """Return True if ``password`` passes every per-password filter.

    A password is accepted only when all of the following hold:

    - its length is between 4 and 30 characters, both bounds included;
    - it contains only printable ASCII characters;
    - it does not start with a literal backslash followed by ``x``;
    - it does not contain the ``$HEX`` marker;
    - it does not contain any HTML entity marker
      (``&lt``, ``&gt``, ``&le``, ``&ge``, ``&#``, ``&amp``).

    :param password: candidate password (``str``).
    :return: ``True`` when the password should be kept, ``False`` otherwise.
    """
    # Inclusive bounds: only passwords shorter than 4 or longer than 30 are
    # dropped. The previous strict comparison also discarded lengths 4 and 30.
    if not 4 <= len(password) <= 30:
        return False

    if not (password.isascii() and password.isprintable()):
        return False

    # Hex-encoded dumps: a leading "\x" escape or the "$HEX[...]" wrapper.
    if password.startswith('\\x') or '$HEX' in password:
        return False

    # Leftovers of HTML-escaped text.
    html_markers = ('&lt', '&gt', '&le', '&ge', '&#', '&amp')
    return not any(marker in password for marker in html_markers)


# The function filter_file filters the source, removing mails as follows:
# - mails which appear more than 100 times or fewer than 2 times.
# - mails with a non-valid password.
Пример #2
0
from pathlib import Path
import pandas as pd
import numpy as np
from word2keypress import Keyboard
import pdb
import gensim
import numpy as np
from numpy import dot
from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format, _compute_ngrams, _ft_hash
from numpy import dot
import numpy as np
import math
from gensim import utils, matutils

  
# Module-level Keyboard instance from word2keypress, built with its default arguments.
KB = Keyboard()

# Load the pre-trained embedding model from a fixed absolute path on disk.
# NOTE(review): the file name suggests a fastText model trained on key
# sequences (min count 10, n-grams 1-4, 100 dimensions) — confirm against the
# training script.
model = gensim.models.Word2Vec.load('/hdd/c3s/models/fastText2_keyseq_mincount:10_ngram:1-4_negsamp:5_subsamp:0.001_d:100')
# NOTE(review): init_sims() presumably precomputes the normalised vectors
# (such as model.wv.vectors_ngrams_norm read by get_vector_ngram) — confirm
# for the installed gensim version.
model.init_sims()
def get_vector_ngram(word):
    """Sum the normalised n-gram vectors of ``word`` found in the global ``model``.

    The n-grams of ``word`` are obtained from gensim's ``_compute_ngrams``
    using the model's ``min_n``/``max_n``. Each n-gram is hashed with
    ``_ft_hash`` modulo ``model.wv.bucket``; when the resulting hash is a key
    of ``model.wv.hash2index``, the matching row of
    ``model.wv.vectors_ngrams_norm`` is added to the accumulator and the hit
    counter is incremented.

    NOTE(review): the visible body ends right after the accumulation loop,
    with no ``return`` and no use of ``ngrams_found``. The function looks
    truncated; presumably an averaging/return step follows — confirm against
    the full source.

    :param word: token whose n-gram vectors are accumulated.
    """
    # Accumulator sized to the width of the n-gram embedding matrix.
    word_vec = np.zeros(model.wv.vectors_ngrams.shape[1], dtype=np.float32)
  
    ngrams = _compute_ngrams(word, model.wv.min_n, model.wv.max_n)
    # Number of n-grams whose hash is known to the model.
    ngrams_found = 0
    
    for ngram in ngrams:
        # Reduce the n-gram hash to a bucket id.
        ngram_hash = _ft_hash(ngram) % model.wv.bucket
        # Only hashes present in hash2index contribute to the sum.
        if ngram_hash in model.wv.hash2index:
            word_vec += model.wv.vectors_ngrams_norm[model.wv.hash2index[ngram_hash]]
        
            ngrams_found += 1
Пример #3
0
import numpy as np

# Characters accepted in a password: ASCII letters, digits, punctuation and
# the space character.
ALLOWED_CHARACTERS = string.ascii_letters + string.digits + string.punctuation + ' '  # removed tab

# Symbol/digit keys without Shift and, position by position, the characters
# paired with them when shifted. The backslash is spelled as an explicit
# "\\" escape: the unrecognised "\;" escape used before yields the same two
# characters but is deprecated and triggers a SyntaxWarning on recent Pythons.
_UNSHIFTED_KEYS = '`1234567890-=[]\\;\',./'
_SHIFTED_KEYS = '~!@#$%^&*()_+{}|:"<>?'

# Byte-level maps between the two rows. Under Python 3, iterating over bytes
# yields ints, so keys and values are integer code points.
NOTSHIFT_2_SHIFT_MAP = dict(
    zip(_UNSHIFTED_KEYS.encode('ascii'), _SHIFTED_KEYS.encode('ascii')))
SHIFT_2_NOTSHIFT_MAP = dict(
    zip(_SHIFTED_KEYS.encode('ascii'), _UNSHIFTED_KEYS.encode('ascii')))

# Character-level map that toggles the shift state in either direction.
SHIFT_SWITCH_MAP = dict(
    zip(_UNSHIFTED_KEYS + _SHIFTED_KEYS, _SHIFTED_KEYS + _UNSHIFTED_KEYS))

# Matches a run of one or more SHIFT_SWITCH_MAP characters anchored at the end
# of the string; the run is exposed as the named group "last".
SYMDIGIT_re = re.compile(r'(?P<last>(%s)+)$' %
                         '|'.join(map(re.escape, SHIFT_SWITCH_MAP.keys())))

# NOTE(review): ALLOWED_KEYS and Keyboard are not defined in the visible part
# of this file; presumably they are imported elsewhere — confirm.
allowed_keys = np.array(list(ALLOWED_KEYS))
KB = Keyboard('US')
"""This is the set of correctors we consider.  A corrector is a
function which tries to fix a typo in a password by applying some
generic modifications. By generic I mean that the modification is not
dependent on the specific password it is modifying.

Every corrector comes with an inverse function which defines the
mistyped password or passwords that this corrector can fix. E.g.,
if "adding a 1 to the end" is a corrector, then removing the last
"1" is the typo version of that corrector.

In the EDITS_NAME_FUNC_MAP, the first entry is the corrector
function while the second entry is the typo (inverse of the
corrector) of that function.