def transform(self, features: Features) -> Features:
    """Return a new ``Features`` holding only the selected columns.

    A column is kept when its name is contained in ``self.features``;
    every other column is dropped from the pandas view of ``features``.
    The type list is filtered by position so it stays aligned with the
    surviving columns.

    Args:
        features: The feature set to filter.

    Returns:
        A ``Features`` object built from the reduced frame and the
        matching subset of ``features.types``.
    """
    kept_positions = []
    dropped_positions = []
    for position, column_name in enumerate(features.names):
        if column_name in self.features:
            bucket = kept_positions
        else:
            bucket = dropped_positions
        bucket.append(position)

    # NOTE(review): whether as_pandas() hands back a copy is not visible
    # here; the frame is modified in place below.
    frame = features.as_pandas()
    # Positions are translated to labels first; the drop itself is by label.
    unwanted_labels = frame.columns[dropped_positions]
    frame.drop(columns=unwanted_labels, inplace=True)

    kept_lookup = set(kept_positions)
    kept_types = []
    for position, feature_type in enumerate(features.types):
        if position in kept_lookup:
            kept_types.append(feature_type)

    return Features.from_pandas(df=frame, types=kept_types)
def fit_transform(self, features: Features) -> Features:
    """Record the distinct values of each categorical column, then encode.

    For every column typed ``FeatureType.CATEGORIAL`` the set of values
    present in the data is stored in ``self.feature_values`` (as a list,
    in set-iteration order). The frame is then passed to
    ``_transform_df`` together with the original feature set.

    Args:
        features: The feature set to fit on and transform.

    Returns:
        Whatever ``_transform_df`` produces for the fitted frame.
    """
    frame = features.as_pandas(copy=False)

    categorical_names = []
    for position, feature_type in enumerate(features.types):
        if feature_type == FeatureType.CATEGORIAL:
            categorical_names.append(features.names[position])

    for column in categorical_names:
        distinct = set(frame[column].tolist())
        logging.debug("Got values %s for feature %s: ", distinct, column)
        self.feature_values[column] = list(distinct)

    return self._transform_df(frame, features)
def _extract_additional_features(self, flows: t.List[NetFlow]) -> Features:
    """Build the four payload-distance columns for every flow.

    Each flow is handed to ``self._make_flow_features``; the collected
    rows are wrapped in a ``Features`` object with four FLOAT columns.

    Args:
        flows: The flows to extract features from.

    Returns:
        A ``Features`` object with one row per flow.
    """
    rows = []
    for flow in flows:
        rows.append(self._make_flow_features(flow))
    data = np.array(rows)

    column_names = [
        "mean_payl_dist",
        "min_payl_dist",
        "max_payl_dist",
        "std_payl_dist",
    ]
    column_types = [FeatureType.FLOAT] * len(column_names)
    return Features(data=data, names=column_names, types=column_types)
def _transform_df(self, df: pandas.DataFrame, original_features: Features) -> Features:
    """Replace each known categorical column with 0/1 indicator columns.

    For every ``(column, values)`` entry in ``self.feature_values`` one
    new column named ``<column>_ohe_<value>`` is appended per value, and
    the source column is then dropped. ``df`` is modified in place.

    Args:
        df: Frame to encode; mutated by this call.
        original_features: Feature set the frame came from; supplies the
            names and types of the columns that are not encoded.

    Returns:
        A ``Features`` object built from the encoded frame. Its types
        are those of the untouched columns followed by one
        ``FeatureType.BINARY`` per indicator column.
    """
    indicator_types = []
    for source_column, categories in self.feature_values.items():
        for category in categories:

            def matches_category(cell):
                # Called immediately by apply() below, so it always sees
                # the current loop value of `category`.
                return 1 if cell == category else 0

            indicator_name = source_column + "_ohe_" + str(category)
            df[indicator_name] = df[source_column].apply(matches_category)
            indicator_types.append(FeatureType.BINARY)
        # The raw column is removed only after all its indicators exist.
        df.drop(source_column, axis=1, inplace=True)

    passthrough_types = []
    for position, name in enumerate(original_features.names):
        if name in self.feature_values:
            continue
        passthrough_types.append(original_features.types[position])

    return Features.from_pandas(df, passthrough_types + indicator_types)
def _extract_additional_features(self, flows: t.List[NetFlow]) -> Features:
    """Compute the relative frequency of every byte value per flow.

    For each flow, the bytes of ``ip.data`` over all of its packets are
    counted per byte value (0-255) and divided by the total number of
    bytes seen, giving 256 FLOAT columns named ``freq_byte_<value>``.

    A flow that contributes no bytes at all yields an all-zero row
    instead of raising ``ZeroDivisionError``. An empty ``flows`` list
    yields an array of shape ``(0, 256)`` so the data always agrees with
    the 256 declared column names.

    Args:
        flows: The flows to extract features from. Each flow's
            ``packets`` is iterated as pairs whose second item exposes
            ``data`` convertible with ``bytes()``.

    Returns:
        A ``Features`` object with one row per flow and 256 columns.
    """
    byte_values = 256
    payload_features = []
    for flow in flows:
        counts = [0] * byte_values
        for _, ip in flow.packets:
            for byte in bytes(ip.data):
                counts[byte] += 1
        total_bytes = sum(counts)
        if total_bytes:
            distribution = [abs_freq / total_bytes for abs_freq in counts]
        else:
            # No payload bytes in this flow: avoid dividing by zero.
            distribution = [0.0] * byte_values
        payload_features.append(distribution)

    # reshape(-1, 256) is a no-op for non-empty input and turns an empty
    # list into a (0, 256) array rather than the (1, 0) that ndmin=2 gives.
    data = np.array(payload_features, dtype=float).reshape(-1, byte_values)
    return Features(
        data=data,
        names=["freq_byte_%s" % i for i in range(byte_values)],
        types=[FeatureType.FLOAT for _ in range(byte_values)],
    )
def _extract_flow_features(self, flows: t.List[NetFlow]) -> Features:
    """Build one feature row per flow according to ``self.modes``.

    When ``FeatureSetMode.BASIC`` is active, a row consists only of the
    source port, destination port and protocol, and no other mode is
    consulted. Otherwise the row starts with port features, the protocol
    and packet-list features (all / forward / backward packets), and is
    extended for each additional active mode: ``WITH_IP_ADDR``,
    ``SUBFLOWS``, ``TCP`` and ``HINDSIGHT``.

    Column names and types come from ``self._make_flow_names_types()``.

    Args:
        flows: The flows to extract features from. Hindsight features
            for flow ``i`` use the up to ``self.hindsight_window`` flows
            that precede it in this list.

    Returns:
        A ``Features`` object with one row per flow.
    """
    rows = []
    names, types = self._make_flow_names_types()

    progress = tqdm(
        flows,
        desc="Extract statistical flow features",
        disable=(not self.verbose),
    )
    for index, flow in enumerate(progress):
        if FeatureSetMode.BASIC in self.modes:
            # Basic mode short-circuits every other feature group.
            rows.append([flow.src_port, flow.dest_port, flow.protocol])
            continue

        fwd_packets = flow.get_packets_in_direction(FlowDirection.FORWARDS)
        bwd_packets = flow.get_packets_in_direction(FlowDirection.BACKWARDS)

        all_stats = self._extract_packet_list_features(flow.packets)
        fwd_stats = self._extract_packet_list_features(fwd_packets)
        bwd_stats = self._extract_packet_list_features(bwd_packets)

        src_port_part = self._get_port_features(flow.src_port)
        dest_port_part = self._get_port_features(flow.dest_port)
        row = (
            src_port_part
            + dest_port_part
            + [flow.protocol]
            + all_stats
            + fwd_stats
            + bwd_stats
        )

        if FeatureSetMode.WITH_IP_ADDR in self.modes:
            src_ip_part = self._get_ip_addr_features(flow.src_ip)
            dest_ip_part = self._get_ip_addr_features(flow.dest_ip)
            row += src_ip_part + dest_ip_part

        if FeatureSetMode.SUBFLOWS in self.modes:
            active_idle_part = self._extract_active_idle_features(flow.packets)
            fwd_subflow_part = self._extract_subflow_features(fwd_packets)
            bwd_subflow_part = self._extract_subflow_features(bwd_packets)
            row += active_idle_part + fwd_subflow_part + bwd_subflow_part

        if FeatureSetMode.TCP in self.modes:
            row += self._make_tcp_features(flow, fwd_packets, bwd_packets)

        if FeatureSetMode.HINDSIGHT in self.modes:
            # Clamp at 0 so the first flows simply see a shorter history.
            window_start = int(max(0, index - self.hindsight_window))
            preceding_flows = flows[window_start:index]
            row += self._make_hindsight_features(flow, preceding_flows)

        rows.append(row)

    return Features(data=np.array(rows), names=names, types=types)
def transform(self, features: Features) -> Features:
    """Encode ``features`` using the values already stored on this object.

    The pandas view is requested with ``copy=False`` and passed straight
    to ``_transform_df`` along with the original feature set.

    Args:
        features: The feature set to encode.

    Returns:
        The result of ``_transform_df``.
    """
    return self._transform_df(features.as_pandas(copy=False), features)
def transform(self, features: Features) -> Features:
    """Apply ``self._scaler`` to the data and rebuild a validated result.

    Names and types for the output are obtained from
    ``self.transform_feature_type_names``; the new ``Features`` object
    is validated before it is returned.

    Args:
        features: The feature set whose ``data`` is scaled.

    Returns:
        A new, validated ``Features`` object holding the scaled data.
    """
    scaled_data = self._scaler.transform(features.data)
    names, types = self.transform_feature_type_names(features)
    scaled = Features(data=scaled_data, names=names, types=types)
    scaled.validate()
    return scaled