def fit(self, X, Y, **kwargs):
    """Train the wrapped keras classifier on the given samples.

    Arguments:
        X (numpy ndarray) : training data
        Y (numpy ndarray) : training labels
        **kwargs : forwarded unchanged to the classifier's ``fit`` method

    Raises:
        ModelNotLoadedError: no classifier has been loaded or passed as arg
        TypeError: X or Y is not a numpy ndarray
        KerasInternalError: the underlying ``fit`` call raised an exception

    Returns:
        self
    """
    if self._keras_classifier is None:
        raise ModelNotLoadedError()
    for array, label in ((X, "X"), (Y, "Y")):
        type_check(array, np.ndarray, label)
    try:
        self._keras_classifier.fit(X, Y, **kwargs)
    except Exception as inner:
        # Any failure inside keras is re-raised as a single wrapper error,
        # keeping the original exception as its cause.
        raise KerasInternalError(
            "Keras internal error. See inner exception for details."
        ) from inner
    return self
def __init__(self, filepath: str):
    """Construct the model by loading a pretrained PyTorch network.

    The model number is parsed from ``filepath``, which must contain
    ``ModelWAF<digits>``. It selects the vocabulary file
    ``./vocab<number>.json``, resolved against the current working directory.

    Arguments:
        filepath (str) : the path to the pretrained network weights

    Raises:
        TypeError: filepath not string
        IndexError: filepath does not contain ``ModelWAF<digits>``
        FileNotFoundError: the vocabulary file or filepath does not exist
    """
    type_check(filepath, str, "filepath")
    # Kept as a function-scope import, exactly where the original placed it.
    from wafamole.models.custom.pytorch_models.ModelClass import SentimentLSTM

    self.filepath = filepath
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    p = re.compile(r".*ModelWAF(\d+).*")
    model_num = p.findall(filepath)
    self.model_number = model_num[0]
    self.vocabfile = "./vocab" + self.model_number + ".json"
    # Context manager so the vocabulary file handle is always closed.
    with open(self.vocabfile) as f:
        self.vocab_to_int = json.load(f)

    vocab_size = len(self.vocab_to_int) + 1  # +1 for the 0 padding
    output_size = 1
    embedding_dim = 100
    hidden_dim = 32
    n_layers = 2
    net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
    # load_model stores the network in self._pytorch_classifier.
    self.load_model(filepath, net)
    super(PyTorchExample, self).__init__(self._pytorch_classifier)
def __init__(self, model: Model):
    """Create an evasion engine bound to the model under evaluation.

    Arguments:
        model (Model) : the model to evaluate

    Raises:
        TypeError: model is not a Model
    """
    # Validate first so the parent never receives a mistyped model.
    type_check(model, Model, "model")
    super(EvasionEngine, self).__init__(model)
def replace_nth(candidate, sub, wanted, n):
    """Replace the n-th occurrence of ``sub`` inside ``candidate`` with ``wanted``.

    ``sub`` is matched literally (it is escaped before searching), and
    occurrences are counted left to right without overlap.

    Arguments:
        candidate (str) : the string to be modified
        sub (str) : the text to substitute
        wanted (str) : the string that will replace sub
        n (int) : the 1-based index of the occurrence to replace

    Raises:
        TypeError : bad type passed as arguments
        IndexError : ``n - 1`` is not a valid index into the occurrences found

    Returns:
        (str) : the modified string
    """
    for text, label in ((candidate, "candidate"), (sub, "sub"), (wanted, "wanted")):
        type_check(text, str, label)
    type_check(n, int, "n")

    starts = [hit.start() for hit in re.finditer(re.escape(sub), candidate)]
    cut = starts[n - 1]
    # The tail begins with the chosen occurrence, so a single-count replace
    # touches exactly that one.
    head, tail = candidate[:cut], candidate[cut:]
    return head + tail.replace(sub, wanted, 1)
def extract_features(self, value: str):
    """Identity feature extractor: the query itself is the feature.

    Arguments:
        value (str) : input query string

    Raises:
        TypeError: value is not string

    Returns:
        str : the unchanged input value
    """
    type_check(value, str, "value")
    # Nothing to transform; hand the query back as-is.
    return value
def __init__(self, filepath: str):
    """Build the wrapper around a pretrained network stored on disk.

    Arguments:
        filepath (str) : the path to the pretrained h5 net

    Raises:
        TypeError: filepath not string
        FileNotFoundError: filepath not pointing to anything
        NotKerasModelError: filepath not pointing to h5 keras model
    """
    type_check(filepath, str, "filepath")
    file_exists(filepath)
    # load() sets self._keras_classifier, which the parent constructor needs.
    self.load(filepath)
    loaded_classifier = self._keras_classifier
    super(WafBrainWrapper, self).__init__(loaded_classifier)
def classify(self, value: str):
    """Score a query with the wrapped classifier.

    Arguments:
        value (str) : input query

    Raises:
        TypeError: value is not string

    Returns:
        float : probability of being a sql injection (the ``"score"`` entry
        of the ``process_payload`` result)
    """
    type_check(value, str, "value")
    outcome = process_payload(self._keras_classifier, "", [value])
    return outcome["score"]
def filter_candidates(symbols, payload):
    """Keep only the symbols that occur inside the payload.

    Arguments:
        symbols (dict) : dictionary of symbols to filter (using the key)
        payload (str) : the payload to use for the filtering

    Raises:
        TypeError : bad types passed as argument

    Returns:
        list : the keys of ``symbols`` that are contained in ``payload``,
        in the dictionary's iteration order
    """
    type_check(symbols, dict, "symbols")
    type_check(payload, str, "payload")
    # Iterating a dict yields its keys.
    return [symbol for symbol in symbols if symbol in payload]
def random_char(spaces=True):
    """Pick one random printable character.

    Keyword Arguments:
        spaces (bool) : allow whitespace characters [default = True]

    Raises:
        TypeError: spaces not bool

    Returns:
        str : a character drawn from ``string.printable``, excluding
        ``string.whitespace`` when ``spaces`` is False
    """
    type_check(spaces, bool, "spaces")
    if spaces:
        pool = list(string.printable)
    else:
        pool = [ch for ch in string.printable if ch not in string.whitespace]
    return random.choice(pool)
def random_string(max_len=5, spaces=True):
    """Create a random string of between 1 and ``max_len`` characters.

    Keyword Arguments:
        max_len (int) : the maximum length of the string [default=5]
        spaces (bool) : passed to ``random_char``; when False, whitespace
            characters are excluded [default=True]

    Raises:
        TypeError: bad type passed as argument
        ValueError: max_len is lower than 1 (raised by ``random.randint``)

    Returns:
        (str) : random string
    """
    # The label now matches the real parameter name ("max_len"); it used to
    # say "max_length", which is not a parameter of this function.
    type_check(max_len, int, "max_len")
    type_check(spaces, bool, "spaces")
    length = random.randint(1, max_len)
    return "".join(random_char(spaces=spaces) for _ in range(length))
def load_model(self, filepath, ModelClass):
    """Load saved weights into a PyTorch network and keep it as the classifier.

    Arguments:
        filepath (string) : the path of the saved state dict
        ModelClass : the network object that receives the weights; despite
            the name it is used as an instance (``load_state_dict`` and
            ``eval`` are called on it)

    Raises:
        TypeError: filepath is not string.
        FileNotFoundError: filepath not pointing to any file.

    NOTE(review): errors from ``torch.load`` / ``load_state_dict`` are not
    wrapped here and propagate unchanged.

    Returns:
        self
    """
    type_check(filepath, str, "filepath")
    file_exists(filepath)
    state = torch.load(filepath)
    ModelClass.load_state_dict(state)
    # Switch the network to evaluation mode before storing it.
    ModelClass.eval()
    self._pytorch_classifier = ModelClass
    return self
def extract_features(self, value: str):
    """Preprocess a query into the input expected by the PyTorch model.

    Arguments:
        value (str) : the input SQL query.

    Raises:
        ModelNotLoadedError: calling function without having loaded or passed model as arg
        TypeError: value is not string

    Returns:
        the result of ``ut.PreProc`` for this query, model number and
        vocabulary
    """
    if self._pytorch_classifier is None:
        raise ModelNotLoadedError()
    type_check(value, str, "value")
    return ut.PreProc(value, self.model_number, self.vocab_to_int)
def extract_features(self, value: str):
    """Turn a query into a feature vector with the SQLiGoT extractor.

    Arguments:
        value (str) : the input SQL query.

    Raises:
        ModelNotLoadedError: calling function without having loaded or passed model as arg
        TypeError: value is not string

    Returns:
        numpy ndarray : the feature vector
    """
    extractor = self._sklearn_classifier
    if extractor is None:
        raise ModelNotLoadedError()
    type_check(value, str, "value")
    # The graph options chosen at construction time are applied to every query.
    return extractor.preprocess_single_query(
        value,
        undirected=self._undirected,
        proportional=self._proportional,
    )
def load(self, filepath):
    """Read a serialized sklearn classifier from disk and store it.

    Arguments:
        filepath (string) : The path of the sklearn classifier.

    Raises:
        TypeError: filepath is not string.
        FileNotFoundError: filepath not pointing to any file.
        NotSklearnModelError: model can not be loaded.

    Returns:
        self
    """
    type_check(filepath, str, "filepath")
    file_exists(filepath)
    try:
        self._sklearn_classifier = joblib.load(filepath)
    except Exception as cause:
        # Every loading failure is reported as one domain error, keeping the
        # original exception as its cause.
        raise NotSklearnModelError("Error in loading model.") from cause
    return self
def replace_random(candidate, sub, wanted):
    """Replace one randomly chosen occurrence of ``sub`` with ``wanted``.

    ``sub`` is matched literally (it is escaped before searching). When it
    does not occur at all, ``candidate`` is returned unchanged.

    Arguments:
        candidate (str) : the string to be modified
        sub (str) : the text to substitute
        wanted (str) : the string that will replace sub

    Raises:
        TypeError : bad type passed as arguments

    Returns:
        (str) : the modified string
    """
    for text, label in ((candidate, "candidate"), (sub, "sub"), (wanted, "wanted")):
        type_check(text, str, label)

    starts = [hit.start() for hit in re.finditer(re.escape(sub), candidate)]
    if len(starts) == 0:
        return candidate
    cut = random.choice(starts)
    # The tail begins with the chosen occurrence, so a single-count replace
    # touches exactly that one.
    head, tail = candidate[:cut], candidate[cut:]
    return head + tail.replace(sub, wanted, 1)
def __init__(
    self, sqligot_classifier: SQLiGoT = None, undirected=True, proportional=True
):
    """Construct the wrapper.

    Keyword Arguments:
        sqligot_classifier (SQLiGoT) : SQLiGoT object (default: None)
        undirected (bool) : set undirection for feature extraction (default: True)
        proportional (bool) : set weights for edges in graph (default: True)

    Raises:
        TypeError: wrong input types
    """
    # None is allowed and means "no classifier yet"; only a provided object
    # is type-checked.
    if sqligot_classifier is not None:
        type_check(sqligot_classifier, SQLiGoT, "sqligot_classifier")
    type_check(undirected, bool, "undirected")
    type_check(proportional, bool, "proportional")
    self._undirected = undirected
    self._proportional = proportional
    # __init__ must return None, so the parent's result is not returned.
    super(SQLiGoTWrapper, self).__init__(sqligot_classifier)
def load(self, filepath):
    """Load a keras classifier stored in filepath.

    Arguments:
        filepath (string) : The path of the keras classifier.

    Raises:
        TypeError: filepath is not string.
        FileNotFoundError: filepath not pointing to any file.
        NotKerasModelError: model can not be loaded.

    Returns:
        self
    """
    type_check(filepath, str, "filepath")
    file_exists(filepath)
    try:
        self._keras_classifier = keras.models.load_model(filepath)
    except Exception as e:
        raise NotKerasModelError(
            "Can not load keras model. See inner exception for details."
        ) from e
    # The documented contract is "Returns: self"; the function previously
    # fell off the end and returned None.
    return self
def produce_feat_vector(self, sql_query: str, normalize=False):
    """Return the histogram of allowed tokens found in the input query.

    Only the first statement returned by ``sqlparse.parse`` is considered.
    A token that is not itself an allowed token is counted under its closest
    ancestor (following ``.parent``) that is allowed, or dropped if there is
    none.

    Arguments:
        sql_query (str) : An input SQL query

    Keyword Arguments:
        normalize (bool) : True for dividing the histogram by its Euclidean
            norm. None is accepted and treated as False. (default: False)

    Raises:
        TypeError: params has wrong types

    Returns:
        numpy ndarray : histogram of tokens, one entry per allowed token in
        the order of ``self._allowed_tokens``. A query with no statements
        gives an all-zero vector, and an all-zero vector is returned as-is
        (as floats) when ``normalize`` is set.
    """
    type_check(sql_query, str, "sql_query")
    if normalize is not None:
        type_check(normalize, bool, "normalize")

    # sqlparse yields no statements for an empty query; treat that as
    # "no tokens" instead of failing on ``[0]``.
    statements = sqlparse.parse(sql_query)
    if statements:
        tokens = self._produce_tokens(list(statements[0].flatten()))
    else:
        tokens = []

    histogram = OrderedDict.fromkeys(self._allowed_tokens, 0)
    for token in tokens:
        # Climb the parent chain until an allowed token is reached.
        bucket = token
        while bucket is not None and bucket not in histogram:
            bucket = bucket.parent
        if bucket is not None:
            histogram[bucket] += 1

    feature_vector = np.array(list(histogram.values()))
    if normalize:
        norm = np.linalg.norm(feature_vector)
        if norm > 0:
            feature_vector = feature_vector / norm
        else:
            # Dividing by a zero norm would fill the vector with NaNs; keep
            # the zeros, using the dtype a normalized vector would have.
            feature_vector = feature_vector.astype(float)
    return feature_vector
def create_dataset_from_file(
    self, filepath: str, label: int, limit: int = None, unique_rows=True
):
    """Create a dataset from a file containing one SQL query per line.

    Arguments:
        filepath (str) : path of sql queries dataset
        label (int) : label to assign to each sample

    Keyword Arguments:
        limit (int) : if not None, the maximum number of lines to read
            (default: None)
        unique_rows (bool) : True for removing all the duplicates; the rows
            are then returned in the sorted order produced by ``np.unique``
            (default: True)

    Raises:
        TypeError: params has wrong types
        FileNotFoundError: filepath not pointing to regular file
        TypeError: limit is not None and not int

    Returns:
        (numpy ndarray, list) : X and y
    """
    type_check(filepath, str, "filepath")
    type_check(label, int, "label")
    type_check(unique_rows, bool, "unique_rows")
    if limit is not None:
        type_check(limit, int, "limit")
    file_exists(filepath)

    rows = []
    with open(filepath, "r") as f:
        for count, line in enumerate(f):
            # ">=" reads exactly `limit` lines; the previous ">" comparison
            # let one extra line through.
            if limit is not None and count >= limit:
                break
            rows.append(self.produce_feat_vector(line.strip()))

    X = np.unique(rows, axis=0) if unique_rows else np.array(rows)
    y = [label for _ in X]
    return X, y
def preprocess_single_query(
    self, sql_query: str, undirected: bool = False, proportional: bool = True
):
    """Build the graph of a query and extract its feature vector.

    Arguments:
        sql_query (str) : input sql query

    Keyword Arguments:
        undirected (bool) : create undirected graph if true (default: False)
        proportional (bool) : create weighted graph if true (default: True)

    Raises:
        TypeError: arguments are not typed correctly

    Returns:
        numpy ndarray : the feature vector extracted from the query, or None
        when either the graph or the feature vector could not be produced
    """
    type_check(sql_query, str, "sql_query")
    for flag, label in ((undirected, "undirected"), (proportional, "proportional")):
        type_check(flag, bool, label)

    graph = self._create_graph_from_sql_query(
        sql_query, proportional=proportional, undirected=undirected
    )
    if graph is None:
        return None

    if undirected:
        extractor = self._extract_feature_vector_from_undirected_graph
    else:
        extractor = self._extract_feature_vector_from_directed_graph
    # A None result from the extractor is passed through unchanged.
    return extractor(graph)
def extract_features(self, value: str):
    """Extract the feature vector of a query with a fresh ``Tokenizer``.

    Arguments:
        value (str) : the input query

    Raises:
        TypeError: value is not string

    Returns:
        the result of ``Tokenizer().produce_feat_vector(value)``
    """
    type_check(value, str, "value")
    return Tokenizer().produce_feat_vector(value)
def create_dataset(
    self,
    benign_filepath: str,
    sqlia_filepath: str,
    undirected=False,
    proportional=True,
    normalize=True,
    limit_samples=10000,
    balance=True,
    dump_to_file=True,
    check_cache=True,
    save_keyword_append="",
):
    """Create a dataset of both sqli and sane queries, using the input paths.

    If check_cache is true and both paths point to existing files, the two
    paths are loaded directly with ``np.load`` as previously computed feature
    matrices. Otherwise the feature vectors are computed from the files,
    de-duplicated and, when dump_to_file is true, saved in the current
    working directory as
    ``<save_keyword_append>graph_<kind>_<prop>_sane.npy`` and
    ``<save_keyword_append>graph_<kind>_<prop>_sqlia.npy``.

    Arguments:
        benign_filepath (str) : path to sane queries
        sqlia_filepath (str) : path to sqli queries

    Keyword Arguments:
        undirected (bool) : true for undirected graphs (default: False)
        proportional (bool) : true for weighted graphs (default: True)
        normalize (bool) : true for normalizing weights of edges (default: True)
        limit_samples (int) : if not None, how many queries per file to consider (default: 10000)
        balance (bool) : true for balancing the number of sane and sqli queries (default: True)
        dump_to_file (bool) : true for storing the computed queries to file (default: True)
        check_cache (bool) : enable dump load (default: True)
        save_keyword_append (str) : prefix prepended to both output file names (default: '')

    Raises:
        TypeError: arguments are not typed correctly

    Returns:
        (numpy ndarray, numpy ndarray) : X and y, where benign rows come
        first and are labelled -1, and sqli rows are labelled 1
    """
    type_check(benign_filepath, str, "benign_filepath")
    type_check(sqlia_filepath, str, "sqlia_filepath")
    type_check(undirected, bool, "undirected")
    type_check(proportional, bool, "proportional")
    type_check(normalize, bool, "normalize")
    type_check(balance, bool, "balance")
    type_check(dump_to_file, bool, "dump_to_file")
    type_check(save_keyword_append, str, "save_keyword_append")
    kind = "undirected" if undirected else "directed"
    prop = "proportional" if proportional else "unprop"

    use_cache = (
        check_cache
        and os.path.isfile(benign_filepath)
        and os.path.isfile(sqlia_filepath)
    )
    if use_cache:
        X_benign = np.load(benign_filepath)
        X_sqlia = np.load(sqlia_filepath)
    else:
        X_benign = self._create_feature_vectors_from_file(
            benign_filepath,
            undirected=undirected,
            proportional=proportional,
            normalize=normalize,
            limit_samples=limit_samples,
        )
        X_sqlia = self._create_feature_vectors_from_file(
            sqlia_filepath,
            undirected=undirected,
            proportional=proportional,
            normalize=normalize,
            limit_samples=limit_samples,
        )
        X_benign = np.unique(X_benign, axis=0)
        X_sqlia = np.unique(X_sqlia, axis=0)
        # Honour dump_to_file: the matrices used to be written unconditionally.
        if dump_to_file:
            np.save(
                "{}graph_{}_{}_sane.npy".format(save_keyword_append, kind, prop),
                X_benign,
            )
            np.save(
                "{}graph_{}_{}_sqlia.npy".format(save_keyword_append, kind, prop),
                X_sqlia,
            )

    # Shared tail for the cached and the freshly computed paths.
    if balance:
        X_benign, X_sqlia = self._balance_data(X_benign, X_sqlia)
    X = np.vstack((X_benign, X_sqlia))
    y = np.ones(len(X_benign) + len(X_sqlia))
    y[: len(X_benign)] = -1
    return X, y
def evaluate(
    self,
    payload: str,
    max_rounds: int = 1000,
    round_size: int = 20,
    timeout: int = 14400,
    threshold: float = 0.5,
):
    """Try to mutate a payload until it is classified as benign.

    The search keeps every improving (confidence, payload) pair. It stops
    when the best confidence is no longer above ``threshold``, when
    ``max_rounds`` mutation rounds have been spent, or when ``timeout``
    seconds have elapsed. The timeout relies on ``SIGALRM``, so this must run
    in the main thread of a platform that provides that signal.

    Arguments:
        payload (str) : the initial payload
        max_rounds (int) : maximum number of mutation rounds
        round_size (int) : how many mutation for each round
        timeout (int) : number of seconds before the timeout
        threshold (float) : default 0.5, customizable for different results

    Raises:
        TypeError : input arguments are mistyped.
        TimeoutError : the timeout expired during the very first mutation
            round, before any result was available.

    Returns:
        float, str : minimum confidence and correspondent payload that achieve that score
    """
    type_check(payload, str, "payload")
    type_check(max_rounds, int, "max_rounds")
    type_check(round_size, int, "round_size")
    type_check(timeout, int, "timeout")
    type_check(threshold, float, "threshold")

    def _signal_handler(signum, frame):
        raise TimeoutError()

    # Timeout setup
    previous_handler = signal.signal(signal.SIGALRM, _signal_handler)
    signal.alarm(timeout)
    try:
        evaluation_results = []
        min_confidence, min_payload = self._mutation_round(payload, round_size)
        evaluation_results.append((min_confidence, min_payload))

        try:
            while max_rounds > 0 and min_confidence > threshold:
                # Try the best candidates first; restart from the updated
                # ranking as soon as one of them improves.
                for candidate_confidence, candidate_payload in sorted(
                    evaluation_results
                ):
                    max_rounds -= 1
                    confidence, payload = self._mutation_round(
                        candidate_payload, round_size
                    )
                    if confidence < candidate_confidence:
                        evaluation_results.append((confidence, payload))
                        min_confidence, min_payload = min(evaluation_results)
                        break

            if min_confidence < threshold:
                print("[+] Threshold reached")
            elif max_rounds <= 0:
                print("[!] Max number of iterations reached")

        except TimeoutError:
            print("[!] Execution timed out")
    finally:
        # Cancel the pending alarm and put the previous handler back, so a
        # stale SIGALRM cannot raise TimeoutError after this call returns.
        signal.alarm(0)
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

    print(
        "Reached confidence {}\nwith payload\n{}".format(
            min_confidence, min_payload
        )
    )

    return min_confidence, min_payload