예제 #1
0
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training
    '''
    X = []
    y = []
    assigned_labels = []

    # Get all the files in the directory
    files = []
    with open(os.path.join(data_dir,'label_assignments.json')) as handle:
        label_assignments = json.load(handle)

    for dirpath, dirnames, filenames in os.walk(data_dir):
        for file in filenames:
            files.append(os.path.join(dirpath,file))
    # Go through all the files in the directory
    for filename in files:
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        if name in label_assignments:
            label = label_assignments[name]
            if label not in labels: label = 'Unknown'
        else:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)

        print("Reading", filename,"as",label)
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(
                                       filename,
                                       duration=duration
                                     )

        # For each of the session bins, compute the  full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict)
            # Store the feature vector and the labels
            X.append(features)
            y.append(assigned_labels.index(label))

        # Update the labels to reflect the new assignments
        new_labels = assigned_labels + \
                     [l for l in labels if l not in assigned_labels]

    return np.stack(X), np.stack(y), new_labels
예제 #2
0
    def get_features(self, filepath, source_ip=None):
        '''
        Reads a pcap specified by the file path and returns an array of the
        computed model inputs

        Args:
            filepath: Path to pcap to compute features for

        Returns:
            features: Numpy 2D array containing features for each time bin
            timestamp: datetime of the last observed packet
        '''

        # Read the capture into a feature array
        X = []
        timestamps = []
        binned_sessions = sessionizer(filepath, duration=self.duration)
        self.sessions = binned_sessions

        if len(binned_sessions) is 0:
            return None, None, None, None

        for session_dict in binned_sessions:
            if len(session_dict) > 0:
                if source_ip is None:
                    feature_list, source_ip, other_ips = extract_features(
                        session_dict
                    )
                else:
                    feature_list, _, other_ips = extract_features(
                        session_dict,
                        capture_source=source_ip
                    )
                X.append(feature_list)
                last_packet = list(session_dict.items())[-1]
                timestamps.append(last_packet[1][0][0])

        if len(X) == 0:
            return None, None, None, None

        full_features = np.stack(X)

        # Mean normalize the features
        full_features -= np.expand_dims(self.means, 0)
        full_features /= np.expand_dims(self.stds, 0)
        features = full_features[:, self.feature_list]
        return features, source_ip, timestamps, other_ips
예제 #3
0
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training
    '''
    logger = logging.getLogger(__name__)
    try:
        if 'LOG_LEVEL' in os.environ and os.environ['LOG_LEVEL'] != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:
        logger.error(
            'Unable to set logging level because: {0} defaulting to INFO.'.
            format(str(e)))
    X = []
    y = []
    assigned_labels = []

    # Get all the files in the directory
    files = []
    with open('opts/label_assignments.json') as handle:
        label_assignments = json.load(handle)

    for dirpath, _, filenames in os.walk(data_dir):
        for file in filenames:
            _, ext = os.path.splitext(file)
            if ext == '.pcap':
                files.append(os.path.join(dirpath, file))
    # Go through all the files in the directory
    logger.info('Found {0} pcap files to read.'.format(len(files)))
    count = 0
    for filename in files:
        count += 1
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        if name in label_assignments:
            label = label_assignments[name]
            if label not in labels:
                label = 'Unknown'
        else:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)

        logger.info('Reading {0} ({1} bytes) as {2} ({3}/{4})'.format(
            filename, os.path.getsize(filename), label, count, len(files)))
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)
        # Get the capture source from the binned sessions
        capture_source = get_source(binned_sessions)

        # For each of the session bins, compute the  full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict,
                                              capture_source=capture_source)

            # Store the feature vector and the labels
            X.append(features)
            y.append(assigned_labels.index(label))

        # Update the labels to reflect the new assignments
        new_labels = assigned_labels + \
            [l for l in labels if l not in assigned_labels]

    return np.stack(X), np.stack(y), new_labels