def dataSetGenKdd99(log):
    kddNormalizeMap = list(range(40))
    kddNormalizeMap[1:3] = {}, {}, {}

    def kddNormalize(kddArr):
        # [0 b'tcp' b'http' b'SF' ...
        result = []
        for i, kddMap, kddEntry in zip(range(len(kddArr)), kddNormalizeMap, kddArr):
            if i == 0 or i >= 4:
                result.append(float(kddEntry))
                continue
            if kddEntry not in kddMap:
                kddMap[kddEntry] = len(kddMap)
            result.append(float(kddMap[kddEntry]))
        return result

    from sklearn.datasets import fetch_kddcup99
    kddcup99 = fetch_kddcup99()
    # kddcup99.data.shape == (494021, 41)
    log.info(f'Dataset len {kddcup99.data.shape}')
    allData = list()
    for data, target in zip(kddcup99.data, kddcup99.target):
        data = kddNormalize(data)
        data = [float(i) for i in data]
        allData.append((data, target.decode(encoding='utf-8')))
    return allData
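A minimal usage sketch for the generator above, assuming only the standard library logging module for the log argument; the logger name is illustrative:

# Usage sketch for dataSetGenKdd99 defined above; the logger setup is an assumption.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('kdd99')

all_data = dataSetGenKdd99(log)
features, label = all_data[0]
# features is a list of 41 floats; the categorical columns 1-3 (protocol_type,
# service, flag) have been replaced by incrementally assigned integer codes.
print(len(features), label)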
def _tabular_data(self):
    kddcup99_all_data = fetch_kddcup99()
    feature_names = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'
    ]
    tabular_data_set = pd.DataFrame(kddcup99_all_data['data'])
    tabular_data_set.columns = feature_names
    # drop_duplicates(..., inplace=True) returns None, so keep the returned copy instead
    tabular_data_set = tabular_data_set.drop_duplicates(
        subset=feature_names, keep='first')
    target = kddcup99_all_data['target']
    new_tabular_data_set, target = self._sub_sampling(
        tabular_data_set, target)
    list_of_columns = ['protocol_type', 'service', 'flag']
    X = self._set_tabular_df(data_to_work_on=new_tabular_data_set,
                             list_of_columns=list_of_columns)
    return self._split_train_and_test_tabular_data(
        X, *self._create_value_mapping_for_target_tabular_data(target))
def __init__(self, subset="SF", percent10=False): dataset = datasets.fetch_kddcup99(subset=subset, percent10=percent10) if subset == "SA": columns = sa_columns toDecode = toDecodeSA else: toDecode = toDecodeSF columns = sf_columns self.df = pd.DataFrame(dataset.data, columns=columns) assert len(self.df) > 0, f"{subset} dataset not loaded." self.df["target"] = dataset.target anomaly_rate = 1.0 - len( self.df.loc[self.df["target"] == b'normal.']) / len(self.df) print(f"SA anomaly rate is {anomaly_rate:.1%}") self.df["binary_target"] = [ 1 if x == b'normal.' else -1 for x in self.df["target"] ] le = preprocessing.LabelEncoder() for f in toDecode: self.df[f] = list(map(byte_decoder, self.df[f])) self.df[f] = le.fit_transform(self.df[f]) a, b, c, d = train_test_split(self.df.drop(["target", "binary_target"], axis=1), self.df["binary_target"], test_size=0.33, random_state=0) self.x_train = a self.x_test = b self.y_train = c self.y_test = d self.estimators = Estimators()
def main():
    np.random.seed(20)

    def scorer(est, x, y):
        y_hat = est.predict(x)
        return classification.accuracy_score(y, y_hat)

    # x, y = make_classification(n_samples=1000, n_classes=4, n_informative=10)
    x, y = fetch_kddcup99(return_X_y=True)
    x = np.array(x[:, 4:], dtype=np.float32)
    y = preprocessing.LabelEncoder().fit_transform(y)
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    myclf = Mree(split_method=greedy_classification)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy classification result", score, np.mean(score))

    myclf = Mree(split_method=greedy_classification_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy p@k classification result", score, np.mean(score))

    myclf = Mree(split_method=random_classify_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine random p@k classification result", score, np.mean(score))

    clf = DecisionTreeClassifier(max_depth=10, max_features=20,
                                 min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn greedy classification result", score, np.mean(score))

    clf = ExtraTreeClassifier(max_depth=10, max_features=20,
                              min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn random classification result", score, np.mean(score))
def get_dataset(name):
    from sklearn.preprocessing import scale
    data = []
    if name == "cancer":
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
    elif name == "digits":
        from sklearn.datasets import load_digits
        dataset = load_digits()
    elif name == "iris":
        from sklearn.datasets import load_iris
        dataset = load_iris()
    elif name == "boston":
        from sklearn.datasets import load_boston
        dataset = load_boston()
    elif name == "KDD":
        from sklearn.datasets import fetch_kddcup99
        dataset = fetch_kddcup99(subset='SF')
        data = dataset.data[:2000, [0, 2, 3]]
    else:
        print("Unknown name of dataset")
        exit(-1)
    labels = dataset.target
    if len(data) == 0:
        data = scale(dataset.data)
    n_samples, n_features = data.shape
    n_elements = len(set(labels))
    return data, n_elements, labels, len(set(labels))
def test_shuffle():
    try:
        dataset = fetch_kddcup99(random_state=0,
                                 subset='SA',
                                 shuffle=True,
                                 percent10=True,
                                 download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert any(dataset.target[-100:] == b'normal.')
def test_Kddcup99Numba():
    from sklearn.datasets import fetch_kddcup99
    kddcup99 = fetch_kddcup99()
    total = len(kddcup99.data)
    online = 442800
    offline = 48791
    # total - online - offline
    print(kddcup99.data[0])

    kddNormalizeMap = list(range(40))
    kddNormalizeMap[1:3] = {}, {}, {}

    def kddNormalize(kddArr):
        # [0 b'tcp' b'http' b'SF' ...
        result = []
        for i, kddMap, kddEntry in zip(range(len(kddArr)), kddNormalizeMap, kddArr):
            if i == 0 or i >= 4:
                result.append(float(kddEntry))
                continue
            if kddEntry not in kddMap:
                kddMap[kddEntry] = len(kddMap)
            result.append(float(kddMap[kddEntry]))
        return np.array(result)

    tenPercent = (total // 10)
    baseMapKddcup99 = []
    for data, target in zip(kddcup99.data[:tenPercent],
                            kddcup99.target[:tenPercent]):
        baseMapKddcup99.append({
            'item': kddNormalize(data),
            'label': str(target)
        })
    trainingDF = pd.DataFrame(baseMapKddcup99)
    print(trainingDF.head())

    init = time.time()
    clusters = minasOffline(trainingDF)
    print(f'minasOffline(testKddcup99Numba) => {len(clusters)}, '
          f'{time.time() - init} seconds')
    labels = []
    for cl in clusters:
        if cl.label not in labels:
            print(cl)
            labels.append(cl.label)
    print('\n')

    # minasOnline
    allZip = zip(map(kddNormalize, kddcup99.data[tenPercent + 1:]),
                 map(str, kddcup99.target[tenPercent + 1:]))
    inputStream = (Example(item=i, label=t) for i, t in allZip)
    init = time.time()
    for o in metaMinas(minasOnline(inputStream, clusters, minDist=minDistNumba)):
        print(o)
    print(f'metaMinas(minasOnline(testKddcup99Numba) {time.time() - init} seconds')
def load_dataset():
    target = 'target'
    sf = datasets.fetch_kddcup99(subset='SF', percent10=False)
    dfSF = pd.DataFrame(
        sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])
    assert len(dfSF) > 0, "SF dataset not loaded."
    dfSF[target] = sf.target
    return target, dfSF
def fetch(dataset='http', fetch_percent_10=True):
    if _Debug:
        pdb.set_trace()
    raw_data = fetch_kddcup99(subset=dataset, percent10=fetch_percent_10)
    unique_data, index = np.unique(raw_data.data.astype(float),
                                   axis=0,
                                   return_index=True)
    scaler_data = StandardScaler().fit_transform(raw_data.data[index])
    scaler_data = MinMaxScaler().fit_transform(scaler_data)
    return scaler_data, raw_data.target[index]
def prepare_kddcup():
    x, y = fetch_kddcup99(return_X_y=True)
    y = preprocessing.LabelEncoder().fit_transform(y)
    nominal_cols = [1, 2, 3]
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    for n, (train_index, test_index) in enumerate(cv.split(x, y)):
        X_train = x[train_index]
        X_test = x[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        yield n, X_train, y_train, X_test, y_test, nominal_cols
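A usage sketch for the prepare_kddcup generator above; the OrdinalEncoder and DecisionTreeClassifier are illustrative choices, not part of the original code, and the nominal byte-string columns are encoded here only so the example runs end to end:

# Usage sketch for prepare_kddcup defined above; estimator choice is an assumption.
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

for fold, X_train, y_train, X_test, y_test, nominal_cols in prepare_kddcup():
    # Columns 1-3 hold byte strings (protocol_type, service, flag), so encode
    # them before handing the data to a numeric estimator.
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[:, nominal_cols] = enc.fit_transform(X_train[:, nominal_cols])
    X_test[:, nominal_cols] = enc.transform(X_test[:, nominal_cols])

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train.astype(np.float64), y_train)
    acc = accuracy_score(y_test, clf.predict(X_test.astype(np.float64)))
    print(f'fold {fold}: accuracy = {acc:.3f}')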
def load_kddcup99():
    X, y = fetch_kddcup99(shuffle=True, return_X_y=True, percent10=False)
    categorical_features = [1, 2, 3]
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)])
    return preprocessor.fit_transform(X), LabelEncoder().fit_transform(y)
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert data.data.shape == (494021, 41)
    assert data.target.shape == (494021,)

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert data.data.shape == data_shuffled.data.shape
    assert data.target.shape == data_shuffled.target.shape

    data = fetch_kddcup99('SA')
    assert data.data.shape == (100655, 41)
    assert data.target.shape == (100655,)

    data = fetch_kddcup99('SF')
    assert data.data.shape == (73237, 4)
    assert data.target.shape == (73237,)

    data = fetch_kddcup99('http')
    assert data.data.shape == (58725, 3)
    assert data.target.shape == (58725,)

    data = fetch_kddcup99('smtp')
    assert data.data.shape == (9571, 3)
    assert data.target.shape == (9571,)

    fetch_func = partial(fetch_kddcup99, 'smtp')
    check_return_X_y(data, fetch_func)
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))

    fetch_func = partial(fetch_kddcup99, 'smtp')
    check_return_X_y(data, fetch_func)
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))
def make_kddcup(n_samples):
    # return_X_y=True is needed so the call unpacks into (features, targets)
    features, targets = datasets.fetch_kddcup99(subset='smtp', return_X_y=True)

    features = pd.DataFrame(
        features,
        columns=['feature_{}'.format(i) for i in range(features.shape[1])],
        dtype=np.float32)
    targets = pd.Series(targets, name='target', dtype=np.float32)
    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)

    features = features.sample(n=n_samples)
    return features, targets.loc[features.index]
def kddcup(percent10, random_state=1):
    data = fetch_kddcup99(percent10=percent10)
    x = data.data
    y_ori = data.target
    y = np.array([1 if l == b'normal.' else -1 for l in y_ori])

    # Integer-encode the three categorical columns (protocol_type, service, flag).
    labelencoder_x_1 = LabelEncoder()
    labelencoder_x_2 = LabelEncoder()
    labelencoder_x_3 = LabelEncoder()
    x[:, 1] = labelencoder_x_1.fit_transform(x[:, 1])
    x[:, 2] = labelencoder_x_2.fit_transform(x[:, 2])
    x[:, 3] = labelencoder_x_3.fit_transform(x[:, 3])

    # One-hot encode each categorical column in turn; the column index shifts
    # after each expansion. Note: categorical_features requires scikit-learn < 0.22.
    onehotencoder_1 = OneHotEncoder(categorical_features=[1])
    x = onehotencoder_1.fit_transform(x).toarray()
    onehotencoder_2 = OneHotEncoder(categorical_features=[4])
    x = onehotencoder_2.fit_transform(x).toarray()
    onehotencoder_3 = OneHotEncoder(categorical_features=[70])
    x = onehotencoder_3.fit_transform(x).toarray()

    normal = x[np.where(y == 1)]
    anomalies = x[np.where(y == -1)]
    anomalies = shuffle(anomalies, random_state=1)
    anomalies = anomalies[:int(len(normal) / 19)]

    scaler = MinMaxScaler()
    scaler.fit(np.concatenate((normal, anomalies), axis=0))
    normal = scaler.transform(normal)
    anomalies = scaler.transform(anomalies)

    x = np.concatenate((normal, anomalies), axis=0)
    y = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x, y = shuffle(x, y, random_state=random_state)

    normal = x[np.where(y == 1)]
    test_normal = normal[int(len(normal) / 2):]
    normal = normal[:int(len(normal) / 2)]
    anomalies = x[np.where(y == -1)]
    test_anomalies = anomalies[int(len(anomalies) / 2):]
    anomalies = anomalies[:int(len(anomalies) / 2)]

    x_train = np.concatenate((normal, anomalies), axis=0)
    y_train = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x_train, y_train = shuffle(x_train, y_train, random_state=1)

    x_test = np.concatenate((test_normal, test_anomalies), axis=0)
    y_test = np.concatenate(([1] * len(test_normal), [-1] * len(test_anomalies)),
                            axis=0)
    x_test, y_test = shuffle(x_test, y_test, random_state=1)

    return x_train, y_train, x_test, y_test
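The categorical_features argument used above was removed in scikit-learn 0.22, so the function only runs on older releases. A minimal sketch of an equivalent encoding step on current scikit-learn, assuming the same three categorical columns (indices 1-3), could look like this:

# Sketch of the one-hot step for scikit-learn >= 0.22, where
# OneHotEncoder(categorical_features=...) no longer exists.
import numpy as np
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import OneHotEncoder

data = fetch_kddcup99(percent10=True)
x = data.data[:10000]  # small slice just to keep the sketch light

# One-hot encode protocol_type, service and flag; keep the remaining
# numeric columns as-is and stack everything back together.
x_cat = OneHotEncoder(handle_unknown='ignore').fit_transform(x[:, 1:4]).toarray()
x_num = x[:, np.r_[0, 4:x.shape[1]]].astype(np.float64)
x_encoded = np.hstack([x_num, x_cat])
print(x_encoded.shape)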
def load_kddcup99():
    X, y = fetch_kddcup99(shuffle=True, return_X_y=True)
    df_X = pd.DataFrame(X)
    X = pd.get_dummies(df_X,
                       columns=[1, 2, 3],
                       prefix=['protocol_type', 'service', 'flag']
                       ).values.astype(np.float32)
    # Min-max scale each column, then drop columns that became NaN
    # (constant columns divide by zero).
    max_by_col = np.max(X, axis=0)
    min_by_col = np.min(X, axis=0)
    X = (X - min_by_col) / (max_by_col - min_by_col)
    X = X[:, ~np.any(np.isnan(X), axis=0)]
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)
    return X, y
def load_dataset():
    target = 'target'
    sf = datasets.fetch_kddcup99(subset='SF', percent10=False)
    dfSF = pd.DataFrame(
        sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])
    assert len(dfSF) > 0, "SF dataset not loaded."
    dfSF[target] = sf.target
    anomaly_rateSF = 1.0 - len(
        dfSF.loc[dfSF[target] == b'normal.']) / len(dfSF)
    # number of records in the dataset
    print("kddcup length:", len(dfSF))
    # true anomaly rate
    print("SF Anomaly Rate is: " + "{:.1%}".format(anomaly_rateSF))
    return target, dfSF
def __init__(self, batch_size):
    kddcup99 = datasets.fetch_kddcup99()
    self._encoder = {
        'protocal': LabelEncoder(),
        'service': LabelEncoder(),
        'flag': LabelEncoder(),
        'label': LabelEncoder()
    }
    self.batch_size = batch_size
    data_X, data_y = self.__encode_data(kddcup99.data, kddcup99.target)
    self.train_dataset, self.test_dataset = self.__split_data_to_tensor(
        data_X, data_y)
    self.train_dataloader = DataLoader(
        self.train_dataset, self.batch_size, shuffle=True)
    self.test_dataloader = DataLoader(
        self.test_dataset, self.batch_size, shuffle=True)
def load_train_test_data(
    small: bool, train_normal_only: bool
) -> Tuple[Tuple[pd.DataFrame, np.ndarray], Tuple[pd.DataFrame, np.ndarray]]:
    X, y = fetch_kddcup99(subset='SA', percent10=small, return_X_y=True)
    columns = [
        "duration", "protocol_type", "service", "flag", "src_bytes",
        "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
        "num_failed_logins", "logged_in", "num_compromised", "root_shell",
        "su_attempted", "num_root", "num_file_creations", "num_shells",
        "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate",
        "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
        "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
        "dst_host_srv_count", "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
        "dst_host_srv_serror_rate", "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate"
    ]
    categorical_columns = ["protocol_type", "flag", "service"]
    features = pd.DataFrame(X, columns=columns)
    target = (y == b'normal.') * 1
    for categorical_column in categorical_columns:
        features[categorical_column] = features[categorical_column].astype(
            'category')
    number_anomalies = np.sum(1 - target)
    number_test_samples = 2 * number_anomalies
    if train_normal_only:
        features_train, features_test = (
            features.iloc[:-number_test_samples],
            features.iloc[-number_test_samples:])
        target_train, target_test = (
            target[:-number_test_samples], target[-number_test_samples:])
    else:
        test_indices = np.random.choice(a=range(len(features)),
                                        size=number_test_samples,
                                        replace=False)
        features_train, features_test = (
            features.drop(test_indices), features.loc[test_indices])
        target_train, target_test = (
            np.delete(target, test_indices), target[test_indices])
    return (features_train, target_train), (features_test, target_test)


# features, target = load_train_test_data(small=True, train_normal_only=True)
# print(features.columns)
def test_cross_validation(self):
    trainer = Trainer()
    kf = KFold(n_splits=3)
    self.features, self.labels = fetch_kddcup99(subset="http",
                                                return_X_y=True)
    self.labels = list(
        map(lambda label: 0 if label == b"normal." else 1, self.labels))
    self.labels = np.array(self.labels)
    for train_index, test_index in kf.split(self.features, self.labels):
        train_data = self.features[train_index]
        test_data = self.features[test_index]
        train_label = self.labels[train_index]
        test_label = self.labels[test_index]
        trainer.train(train_data)
        result = trainer.model.predict(test_data)
        accuracy = accuracy_score(test_label, result)
        print("accuracy =", accuracy)
        assert accuracy > 0.8
def readKDD99(config):
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously created data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        # Coerce numeric-looking columns to numeric dtype.
        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
                pddf[column] = to_numeric(pddf[column], errors="coerce")
            except Exception:
                continue

        colFile = setFile(datadir, "names.dat")
        colnames = open(colFile).readlines()
        targets = colnames[0].split(",")
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
def get_kddcup99_sf():
    X, y = fetch_kddcup99(subset='SF', random_state=42, return_X_y=True)

    # One-hot encode the categorical 'service' column (index 1).
    lb = LabelBinarizer()
    x1 = lb.fit_transform(X[:, 1].astype(str))
    X = np.c_[X[:, :1], x1, X[:, 2:]]

    # Targets are byte strings (e.g. b'normal.'); decode before comparing,
    # then map to +1 for normal traffic and -1 for anomalies.
    y = np.array([label.decode() if isinstance(label, bytes) else str(label)
                  for label in y])
    y_df = pd.DataFrame(y, columns=['class'])
    y_df.loc[y_df['class'] != 'normal.', 'class'] = -1
    y_df.loc[y_df['class'] == 'normal.', 'class'] = 1
    y = y_df.values

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, 'kddcup99_sf'
def get_kddcup99_http():
    # fetch data
    X, y = fetch_kddcup99(subset='http', random_state=42, return_X_y=True)
    X = X.astype(np.float32)

    # Targets are byte strings (e.g. b'normal.'); decode before comparing,
    # then map to +1 for normal traffic and -1 for anomalies.
    y = np.array([label.decode() if isinstance(label, bytes) else str(label)
                  for label in y])
    y_df = pd.DataFrame(y, columns=['class'])
    y_df.loc[y_df['class'] != 'normal.', 'class'] = -1
    y_df.loc[y_df['class'] == 'normal.', 'class'] = 1
    y = y_df.values

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, 'kddcup99_http'
def get_dataset(name):
    from sklearn.preprocessing import scale
    data = []
    if name == "cancer":
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
    elif name == "digits":
        from sklearn.datasets import load_digits
        dataset = load_digits()
    elif name == "iris":
        from sklearn.datasets import load_iris
        dataset = load_iris()
    elif name == "boston":
        from sklearn.datasets import load_boston
        dataset = load_boston()
    elif name == "KDD":
        from sklearn.datasets import fetch_kddcup99
        dataset = fetch_kddcup99(subset='SF')
        data = dataset.data[:2000, [0, 2, 3]]
    elif name == "newsgroup":
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.datasets import fetch_20newsgroups
        dataset = fetch_20newsgroups(subset='train')
        vectorizer = TfidfVectorizer()
        data = vectorizer.fit_transform(dataset.data)
        # data = vectors.nnz / float(vectors.shape[0])
        labels = dataset.target
        return data.toarray(), 1, labels, len(set(labels))
    else:
        print("Unknown name of dataset")
        exit(-1)
    labels = dataset.target
    if len(data) == 0:
        data = scale(dataset.data)
    n_samples, n_features = data.shape
    # n_elements = len(unique(labels))
    return data, 1, labels, len(set(labels))
def X_y_dataset(self,
                remove_duplicates: bool = False,
                full_dataset: bool = True,
                force: bool = False) -> np.array:
    """
    Helper function to create the dataset, including the dependent "target" variable.

    :param remove_duplicates: Flag to decide whether duplicates should be reduced using DataFrame.drop_duplicates.
    :param full_dataset: Flag to decide if the full dataset or only 10% should be retrieved.
    :param force: Flag to force re-retrieval of X and y from source instead of the locally stored (X, y) from a previous call.
    :return: The dataset as (X, y).
    """
    # Lazy init
    if self._X is None or self._y is None or force is True:
        logger.info(f"Step - Only 10% of Dataset: {not full_dataset}")
        data, target = fetch_kddcup99(return_X_y=True,
                                      percent10=(not full_dataset),
                                      random_state=RANDOM_STATE)
        target = np.array(target).reshape(-1, 1)
        self._X = pd.DataFrame(data=data,
                               columns=self.label_manager.X_column_names)
        self._y = pd.DataFrame(data=target,
                               columns=self.label_manager.y_column_name)
        if remove_duplicates:
            self._remove_duplicate_rows()
    return self._X, self._y
from sklearn.utils import shuffle as sh

print(__doc__)

np.random.seed(2)

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['shuttle']

novelty_detection = True  # if False, training set polluted by outliers

for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
                                 percent10=False)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
import numpy as np
import random

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

dataset_name = "Digits"

# %% Generate classes
print('Generating classes')
x, y = fetch_kddcup99(return_X_y=True, subset='http', percent10=True)
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                     test_size=0.4,
                                                     random_state=random_seed)

# %% Perceptron
# TODO: Insert a Perceptron to show the ideal separation

# %% Define model for classification
print('Building model')
# Do not interrupt the training before the end of the epochs, to force an
# overfitting
def load_data(scale_data=False,
              transform_data=False,
              random_slice=None,
              random_seed=None,
              dataset='breast_cancer'):
    if random_seed is not None:
        np.random.seed(random_seed)

    if dataset == 'breast_cancer':
        data = datasets.load_breast_cancer()
    elif dataset == 'kdd':
        data = datasets.fetch_kddcup99()
        # data = datasets.fetch_covtype()

    X = data.data
    Y = data.target
    # np.savetxt("/home/btodorov/Desktop/foo.csv",
    #            X[np.random.choice(Y.shape[0], 1000, replace=False), :],
    #            delimiter=",")
    ten_random_records = np.random.choice(Y.shape[0], 10, replace=False)
    # print(X[ten_random_records, :])
    # print('-----------------------------------------------')
    # print(Y[ten_random_records])
    # print('-----------------------------------------------')
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    print('-----------------------------------------------')

    if random_slice is not None:
        random_indices = np.random.choice(
            Y.shape[0],
            random_slice if random_slice < Y.shape[0] else Y.shape[0],
            replace=False)
        X = X[random_indices, :]
        Y = Y[random_indices]

    if transform_data:
        for i in [1, 2, 3]:
            print(X[0, i])
            le = preprocessing.LabelEncoder()
            le.fit(X[:, i])
            X[:, i] = le.transform(X[:, i])
            print('Min-Max {0}: {1}-{2}'.format(i, np.min(X[:, i]),
                                                np.max(X[:, i])))
        le = preprocessing.LabelEncoder()
        le.fit(Y)
        Y = le.transform(Y)

    print(np.amin(X, axis=0))
    print(np.amax(X, axis=0))
    print(np.var(X, axis=0))
    print('1-----------------------------------------------')

    if scale_data:
        X = preprocessing.scale(X)
        # X = preprocessing.MinMaxScaler().fit_transform(X)
        # for i in range(X.shape[1]):
        #     print('Min-Max {0}: {1}-{2}'.format(i, np.min(X[:, i]), np.max(X[:, i])))
        print(np.amin(X, axis=0))
        print(np.amax(X, axis=0))
        print(np.var(X, axis=0))
        print('2-----------------------------------------------')

    shuffled_indices = np.random.choice(Y.shape[0], Y.shape[0], replace=False)
    X_shuffled = X[shuffled_indices, :]
    Y_shuffled = Y[shuffled_indices]
    return X_shuffled, Y_shuffled
from id3 import Id3Estimator
from id3 import export_graphviz
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
import numpy as np

bunch = fetch_kddcup99(subset="SA")
data = bunch.data
# drop the three categorical columns (protocol_type, service, flag)
data = np.delete(data, np.s_[1:4], axis=1)
target = bunch.target

X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                     test_size=.2,
                                                     random_state=17)

estimator = Id3Estimator()
print("->Fitting ID3 classifier")
estimator.fit(X_train, y_train)

print("->Writing dot file")
export_graphviz(estimator.tree_, 'tree.dot')

print("->Calculating predictions")
pred = estimator.predict(X_test)

well_detected = 0
for index, val in enumerate(pred):
    if val == y_test[index]:
        well_detected += 1
def fetch_kdd(
    target: list = ['dos', 'r2l', 'u2r', 'probe'],
    keep_cols: list = [
        'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
    ],
    percent10: bool = True,
    return_X_y: bool = False
) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    KDD Cup '99 dataset. Detect computer network intrusions.

    Parameters
    ----------
    target
        List with attack types to detect.
    keep_cols
        List with columns to keep. Defaults to continuous features.
    percent10
        Bool, whether to only return 10% of the data.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier').
    (data, target)
        Tuple if 'return_X_y' equals True.
    """
    # fetch raw data
    data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10)

    # specify columns
    cols = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'
    ]

    # create dataframe
    data = pd.DataFrame(data=data_raw['data'], columns=cols)

    # add target to dataframe
    data['attack_type'] = data_raw['target']

    # specify and map attack types
    attack_list = np.unique(data['attack_type'])
    attack_category = [
        'dos', 'u2r', 'r2l', 'r2l', 'r2l', 'probe', 'dos', 'u2r', 'r2l',
        'dos', 'probe', 'normal', 'u2r', 'r2l', 'dos', 'probe', 'u2r',
        'probe', 'dos', 'r2l', 'dos', 'r2l', 'r2l'
    ]
    attack_types = {}
    for i, j in zip(attack_list, attack_category):
        attack_types[i] = j

    data['attack_category'] = 'normal'
    for k, v in attack_types.items():
        data['attack_category'][data['attack_type'] == k] = v

    # define target
    data['target'] = 0
    for t in target:
        data['target'][data['attack_category'] == t] = 1
    is_outlier = data['target'].values

    # define columns to be dropped
    drop_cols = []
    for col in data.columns.values:
        if col not in keep_cols:
            drop_cols.append(col)

    if drop_cols != []:
        data.drop(columns=drop_cols, inplace=True)

    if return_X_y:
        return data.values, is_outlier

    return Bunch(data=data.values,
                 target=is_outlier,
                 target_names=['normal', 'outlier'],
                 feature_names=keep_cols)
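Since fetch_kdd supports both return modes, a short usage sketch, using only the function defined above and its documented defaults, would be:

# Usage sketch for the fetch_kdd helper defined above (defaults assumed).
import numpy as np

# Bunch-style return: data holds the continuous features, target flags outliers.
kdd = fetch_kdd(percent10=True)
print(kdd.data.shape, kdd.target_names)
print('outlier fraction:', np.mean(kdd.target))

# Tuple-style return, restricted to DoS attacks only.
X, y = fetch_kdd(target=['dos'], return_X_y=True)
print(X.shape, int(y.sum()), 'rows flagged as outliers')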
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 2  # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"]

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print("loading data")
    if dataset_name in ["http", "smtp", "SA", "SF"]:
        dataset = fetch_kddcup99(subset=dataset_name,
                                 percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == "shuttle":
        dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
        X = dataset.data
        y = dataset.target.astype(np.int64)
        # we remove data with label 4
        # normal data are then those of class 1
        s = y != 4
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
if dat == 'synthetic1':
    L, S = gen_synthetic(500, 0.05, 25)
    X = L + S
    print('Data Rank = %d, Data NNZs = %d' %
          (matrix_rank(L), np.count_nonzero(S)))

if dat == 'synthetic2':
    L, S = gen_synthetic(1000, 0.05, 25)
    X = L + S
    print('Data Rank = %d, Data NNZs = %d' %
          (matrix_rank(L), np.count_nonzero(S)))

if dat in ['http', 'smtp', 'SA', 'SF']:
    dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
    X = dataset.data
    y = dataset.target

if dat == 'shuttle':
    dataset = fetch_mldata('shuttle')
    X = dataset.data
    y = dataset.target
    # shuffle returns copies, so assign the result back
    X, y = sh(X, y)
    # we remove data with label 4
    # normal data are then those of class 1
    s = (y != 4)
    X = X[s, :]
    y = y[s]
    y = (y != 1).astype(int)
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 2  # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name,
                                 percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

# Set this to true for plotting score histograms for each dataset:
with_decision_function_histograms = False

# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

# Loop over all datasets for fitting and scoring the estimator:
for dat in datasets:
    # Loading and vectorizing the data:
    print('====== %s ======' % dat)
    print('--- Fetching data...')
    if dat in ['http', 'smtp', 'SF', 'SA']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True,
                                 percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y, random_state=random_state)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
        print('----- ')