Example #1
    def _load(self, file_path):
        """Loads and parses a dataframe from a file.

        Args:
            file_path (str): File to be loaded.

        Returns:
            Arrays holding the features and labels.

        """

        # Getting file extension
        extension = file_path.split('.')[-1]

        if extension == 'csv':
            data = loader.load_csv(file_path)

        elif extension == 'txt':
            data = loader.load_txt(file_path)

        elif extension == 'json':
            data = loader.load_json(file_path)

        else:
            raise e.ArgumentError(
                'File extension not recognized. It should be `.csv`, `.json` or `.txt`'
            )

        X, Y = p.parse_loader(data)

        return X, Y
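For reference, the `loader` functions dispatched above can also be called directly; a minimal sketch, assuming `data/boat.csv` exists in the OPF format (the path is illustrative):

import opfython.stream.loader as loader
import opfython.stream.parser as p

# `load_csv` returns an array-like that `parse_loader` splits into
# feature and label arrays
data = loader.load_csv('data/boat.csv')
X, Y = p.parse_loader(data)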
Example #2
def load_dataset(dataset='nsl-kdd'):
    """Loads a dataset.

    Args:
        dataset (str): Dataset's identifier.

    Returns:
        X and Y (samples and labels).

    """

    # If the dataset is `nsl-kdd`
    if dataset == 'nsl-kdd':
        # Loading a .txt file to a numpy array
        txt = l.load_txt('data/nsl-kdd.txt')

    # If the dataset is `unespy`
    elif dataset == 'unespy':
        # Loading a .txt file to a numpy array
        txt = l.load_txt('data/unespy.txt')

    # Otherwise, `txt` would be undefined, so fail early
    else:
        raise ValueError('Dataset not recognized. It should be `nsl-kdd` or `unespy`.')

    # Parsing a pre-loaded numpy array
    X, Y = p.parse_loader(txt)

    return X, Y
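A minimal call sketch for the function above, assuming `data/nsl-kdd.txt` is present:

# Loads and parses NSL-KDD samples and labels
X, Y = load_dataset('nsl-kdd')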
Example #3
import opfython.math.general as general
import opfython.stream.loader as loader
import opfython.stream.parser as parser
import opfython.stream.splitter as splitter


def test_opf_pre_compute_distances():
    txt = loader.load_txt('data/boat.txt')

    X, Y = parser.parse_loader(txt)

    X_train, _, _, _ = splitter.split(X, Y, 0.5, 1)

    general.pre_compute_distance(X_train, 'boat_split_distances.txt', 'log_squared_euclidean')
Example #4
import numpy as np

import opfython.stream.loader as loader
import opfython.stream.parser as parser


def test_parse_loader():
    X, Y = parser.parse_loader([])

    assert X is None
    assert Y is None

    try:
        data = np.ones((4, 4))
        X, Y = parser.parse_loader(data)
    except Exception:
        try:
            data = np.ones((4, 4))
            data[3, 1] = 3
            X, Y = parser.parse_loader(data)
        except Exception:
            csv = loader.load_csv('data/boat.csv')

            X, Y = parser.parse_loader(csv)

            assert X.shape == (100, 2)
            assert Y.shape == (100, )
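The nested `try`/`except` chain above probes `parse_loader`'s failure modes before falling back to a real file; a simpler sketch that checks only the happy path, assuming `data/boat.csv` ships with the test data:

def test_parse_loader_csv():
    csv = loader.load_csv('data/boat.csv')

    X, Y = parser.parse_loader(csv)

    # boat.csv is expected to hold 100 two-feature samples
    assert X.shape == (100, 2)
    assert Y.shape == (100, )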
Example #5
import opfython.stream.loader as l
import opfython.stream.parser as p


def load_dataset(file_path):
    """Loads data from a .txt file and parses it.

    Args:
        file_path (str): Input file to be loaded.

    Returns:
        Samples and labels arrays.

    """

    # Loading a .txt file to a numpy array
    txt = l.load_txt(file_path)

    # Parsing a pre-loaded numpy array
    X, Y = p.parse_loader(txt)

    return X, Y
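A minimal call sketch, assuming an OPF-formatted text file at the given (illustrative) path:

# Loads and parses samples and labels from the file
X, Y = load_dataset('data/boat.txt')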
Example #6
    def _load(self, file_path):
        """Loads and parses a dataframe from a file.

        Args:
            file_path (str): File to be loaded.

        Returns:
            Arrays holding the features and labels.

        """

        # Getting file extension
        extension = file_path.split('.')[-1]

        # Check if extension is .csv
        if extension == 'csv':
            # If yes, call the method that actually loads csv
            data = loader.load_csv(file_path)

        # Check if extension is .txt
        elif extension == 'txt':
            # If yes, call the method that actually loads txt
            data = loader.load_txt(file_path)

        # Check if extension is .json
        elif extension == 'json':
            # If yes, call the method that actually loads json
            data = loader.load_json(file_path)

        # If extension is not recognized
        else:
            # Raises an ArgumentError exception
            raise e.ArgumentError(
                'File extension not recognized. It should be `.csv`, `.json` or `.txt`'
            )

        # Parsing array
        X, Y = p.parse_loader(data)

        return X, Y
Example #7
import opfython.stream.loader as l
import opfython.stream.parser as p
import opfython.stream.splitter as s


def load_split_dataset(file_path, train_split=0.5, random_state=1):
    """Loads data from a .txt file, parses it and splits into training and validation sets.

    Args:
        file_path (str): Input file to be loaded.
        train_split (float): Percentage of training set.
        random_state (int): Seed used to make the split deterministic.

    Returns:
        Training and validation sets along with their indexes.

    """

    # Loading a .txt file to a numpy array
    txt = l.load_txt(file_path)

    # Parsing a pre-loaded numpy array
    X, Y = p.parse_loader(txt)

    # Splitting data into training and validation sets with their indexes
    X_train, X_val, Y_train, Y_val, I_train, I_val = s.split_with_index(
        X, Y, percentage=train_split, random_state=random_state)

    return X_train, Y_train, I_train, X_val, Y_val, I_val
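A minimal call sketch for a 70/30 split with a fixed seed (the path is illustrative):

X_train, Y_train, I_train, X_val, Y_val, I_val = load_split_dataset(
    'data/boat.txt', train_split=0.7, random_state=1)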
Example #8
import opfython.math.general as g
import opfython.stream.loader as l
import opfython.stream.parser as p
import opfython.stream.splitter as s

# Loading a .txt file to a numpy array
txt = l.load_txt('data/boat.txt')

# Parsing a pre-loaded numpy array
X, Y = p.parse_loader(txt)

# Creating a file of pre-computed distances
g.pre_compute_distance(X,
                       'boat_split_distances.txt',
                       distance='log_squared_euclidean')
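The generated file can then be consumed by a classifier instead of evaluating the metric at fit time; a sketch, assuming opfython's `SupervisedOPF` and its `pre_computed_distance` argument:

from opfython.models import SupervisedOPF

# Train with distances read from the pre-computed file rather than
# computing `log_squared_euclidean` on the fly
opf = SupervisedOPF(pre_computed_distance='boat_split_distances.txt')
opf.fit(X, Y)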
Example #9
import numpy as np
import pytest

from opfython.math import distance
from opfython.stream import loader, parser
from opfython.subgraphs import knn

csv = loader.load_csv('data/boat.csv')
X, Y = parser.parse_loader(csv)


def test_knn_subgraph_n_clusters():
    subgraph = knn.KNNSubgraph(X, Y)

    assert subgraph.n_clusters == 0


def test_knn_subgraph_n_clusters_setter():
    subgraph = knn.KNNSubgraph(X, Y)

    try:
        subgraph.n_clusters = 0.5
    except Exception:
        subgraph.n_clusters = 1

    assert subgraph.n_clusters == 1

    try:
        subgraph.n_clusters = -1
    except Exception:
        subgraph.n_clusters = 1

    assert subgraph.n_clusters == 1
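Since `pytest` is already imported, the setter checks can also be expressed with `pytest.raises`; a sketch, assuming the setter rejects non-integer and negative values as exercised above:

def test_knn_subgraph_n_clusters_setter_raises():
    subgraph = knn.KNNSubgraph(X, Y)

    # Non-integer and negative values are expected to raise
    with pytest.raises(Exception):
        subgraph.n_clusters = 0.5

    with pytest.raises(Exception):
        subgraph.n_clusters = -1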