def test_write_dataframe_to_ts_success(tmp_path, dataset):
    """Tests whether a dataset can be written by the .ts writer then read in."""
    # load an example dataset
    path = os.path.join(
        os.path.dirname(sktime.__file__),
        f"datasets/data/{dataset}/{dataset}_TEST.ts",
    )
    test_X, test_y = load_from_tsfile_to_dataframe(path)
    # output the dataframe in a ts file
    write_dataframe_to_tsfile(
        data=test_X,
        path=tmp_path,
        problem_name=dataset,
        class_label=np.unique(test_y),
        class_value_list=test_y,
        comment="""
          The data was derived from twelve monthly electrical power demand
          time series from Italy and first used in the paper "Intelligent
          Icons: Integrating Lite-Weight Data Mining and Visualization into
          GUI Operating Systems". The classification task is to distinguish
          days from Oct to March (inclusive) from April to September.
        """,
        fold="_transform",
    )
    # load data back from the ts file
    result = f"{tmp_path}/{dataset}/{dataset}_transform.ts"
    res_X, res_y = load_from_tsfile_to_dataframe(result)
    # check if the dataframes are the same
    assert_frame_equal(res_X, test_X)
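For context, a minimal sketch of the imports this test assumes (pytest injects tmp_path and dataset as fixtures; exact import paths vary across sktime versions):

import os

import numpy as np
from pandas.testing import assert_frame_equal

import sktime
from sktime.datasets import load_from_tsfile_to_dataframe, write_dataframe_to_tsfile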
Example #2
    def load(self):
        """Load dataset"""
        # load training and test set from separate files

        X_train, y_train = load_from_tsfile_to_dataframe(
            self._train_path, return_separate_X_and_y=True)
        X_test, y_test = load_from_tsfile_to_dataframe(
            self._test_path, return_separate_X_and_y=True)

        # combine into single dataframe
        data_train = pd.concat([X_train, pd.Series(y_train)], axis=1)
        data_test = pd.concat([X_test, pd.Series(y_test)], axis=1)

        # rename target variable
        data_train.rename(columns={data_train.columns[-1]: self._target_name},
                          inplace=True)
        data_test.rename(columns={data_test.columns[-1]: self._target_name},
                         inplace=True)

        # concatenate the two dataframes, keeping training and test split in
        # index, necessary for later optional CV
        data = pd.concat([data_train, data_test],
                         axis=0,
                         keys=["train", "test"]).reset_index(level=1,
                                                             drop=True)

        return data
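Because the frames are concatenated with keys=["train", "test"], the original split survives as the outer index level; a hedged usage sketch, where loader stands for a hypothetical instance of this class:

data = loader.load()

# recover the original split from the outer index level
data_train = data.loc["train"]
data_test = data.loc["test"]

# the target column carries the configured name
y_train = data_train[loader._target_name]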
Example #3
def _load_dataset(name, split, return_X_y, extract_path=None):
    """Load time series classification datasets (helper function)."""
    # Allow the user to specify a non-standard extract path
    if extract_path is not None:
        local_module = os.path.dirname(extract_path)
        local_dirname = extract_path
    else:
        local_module = MODULE
        local_dirname = DIRNAME

    if not os.path.exists(os.path.join(local_module, local_dirname)):
        os.makedirs(os.path.join(local_module, local_dirname))
    if name not in _list_downloaded_datasets(extract_path):
        url = "http://timeseriesclassification.com/Downloads/%s.zip" % name
        # This also tests the validity of the URL; we cannot rely on the
        # HTML status code, as it always returns 200
        try:
            _download_and_extract(
                url,
                extract_path=extract_path,
            )
        except zipfile.BadZipFile as e:
            raise ValueError(
                f"Invalid dataset name: {name} (extract path: {extract_path}). "
                "Please make sure the dataset is available on "
                "http://timeseriesclassification.com/."
            ) from e
    if isinstance(split, str):
        split = split.upper()

    if split in ("TRAIN", "TEST"):
        fname = name + "_" + split + ".ts"
        abspath = os.path.join(local_module, local_dirname, name, fname)
        X, y = load_from_tsfile_to_dataframe(abspath)
    # if split is None, load both train and test set
    elif split is None:
        X = pd.DataFrame(dtype="object")
        y = pd.Series(dtype="object")
        for split in ("TRAIN", "TEST"):
            fname = name + "_" + split + ".ts"
            abspath = os.path.join(local_module, local_dirname, name, fname)
            result = load_from_tsfile_to_dataframe(abspath)
            X = pd.concat([X, pd.DataFrame(result[0])])
            y = pd.concat([y, pd.Series(result[1])])
        y = y.to_numpy(dtype=str)  # np.str was removed in NumPy 1.24
    else:
        raise ValueError(f"Invalid `split` value: {split}")

    # Return appropriately
    if return_X_y:
        return X, y
    else:
        X["class_val"] = pd.Series(y)
        return X
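A hedged usage sketch (the dataset name is illustrative; split is case-insensitive thanks to the upper() call above):

# load only the training split as X, y
X_train, y_train = _load_dataset("GunPoint", "train", return_X_y=True)

# load train and test together, with labels in a class_val column
full_df = _load_dataset("GunPoint", None, return_X_y=False)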
Example #4
    def __init__(self, name, train=True):
        """
        Datasets from the UEA time series archive.

        Args:
            name: Name of the dataset.
            train: Return train split when True, test split when False.
        """

        if name not in _list_downloaded_datasets(UEA_UCR_DATA_DIR):
            url = "http://timeseriesclassification.com/Downloads/%s.zip" % name
            # This also tests the validity of the URL; we cannot rely on the
            # HTML status code, as it always returns 200
            try:
                _download_and_extract(url, UEA_UCR_DATA_DIR)
            except zipfile.BadZipFile as e:
                raise ValueError(
                    "Invalid dataset name. Please make sure the dataset is "
                    "available on http://timeseriesclassification.com/."
                ) from e

        data_path = _build_UEA_UCR_data_path(name, train)

        self.data_x, self.data_y = load_from_tsfile_to_dataframe(data_path)
        # We do not support time series with timestamps yet. Timestamps appear
        # to be stored in the index of the individual series, so this check
        # fails unless the series are regularly sampled and timestamp-free.
        assert isinstance(self.data_x.iloc[0, 0].index, pd.RangeIndex)

        self.class_mapping = self.__build_class_mapping(name)
        self._n_classes = len(self.class_mapping.keys())
Example #5
    @staticmethod
    def __build_class_mapping(name):
        """
        Build a mapping from class labels to integer class ids.

        Args:
            name: Dataset name.
        Returns:
            dict mapping class_label -> class_id.
        """
        train_path = _build_UEA_UCR_data_path(name, True)
        _, train_y = load_from_tsfile_to_dataframe(train_path)
        unique_labels = np.unique(train_y)
        return dict(zip(unique_labels, range(len(unique_labels))))
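Since np.unique returns the labels sorted, the ids are assigned deterministically; for example:

import numpy as np

train_y = np.array(["dog", "cat", "dog", "bear"])
unique_labels = np.unique(train_y)  # array(['bear', 'cat', 'dog'], dtype='<U4')
mapping = dict(zip(unique_labels, range(len(unique_labels))))
print(mapping)  # {'bear': 0, 'cat': 1, 'dog': 2}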
Example #6
def read_dataset(root_dir, dataset_name):
    datasets_dict = {}
    curr_root_dir = root_dir.replace('-temp', '')

    # For UCR
    root_dir_dataset = os.path.join(curr_root_dir, 'UCRArchive_2018')

    x_train, y_train = load_from_tsfile_to_dataframe(
        os.path.join(root_dir_dataset, dataset_name,
                     dataset_name + '_TRAIN.ts'))
    x_test, y_test = load_from_tsfile_to_dataframe(
        os.path.join(root_dir_dataset, dataset_name,
                     dataset_name + '_TEST.ts'))

    #x_train, y_train = load_from_arff_to_dataframe(root_dir_dataset + '/'+ dataset_name + '/' + dataset_name + '_TRAIN.arff')
    #x_test, y_test = load_from_arff_to_dataframe(root_dir_dataset + '/'+ dataset_name + '/' + dataset_name + '_TEST.arff')

    x_train = from_nested_to_2d_array(x_train, return_numpy=True)
    x_test = from_nested_to_2d_array(x_test, return_numpy=True)

    # znorm
    std_ = x_train.std(axis=1, keepdims=True)
    std_[std_ == 0] = 1.0
    x_train = (x_train - x_train.mean(axis=1, keepdims=True)) / std_

    std_ = x_test.std(axis=1, keepdims=True)
    std_[std_ == 0] = 1.0
    x_test = (x_test - x_test.mean(axis=1, keepdims=True)) / std_

    datasets_dict[dataset_name] = (x_train.copy(), y_train.copy(),
                                   x_test.copy(), y_test.copy())

    return datasets_dict
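Note that the z-normalisation above is computed per instance (axis=1), not over the whole dataset; a quick self-contained sketch verifying the invariant:

import numpy as np

x = np.random.rand(4, 100)  # 4 instances of length 100
std_ = x.std(axis=1, keepdims=True)
std_[std_ == 0] = 1.0  # guard against constant series
x_norm = (x - x.mean(axis=1, keepdims=True)) / std_

assert np.allclose(x_norm.mean(axis=1), 0.0)
assert np.allclose(x_norm.std(axis=1), 1.0)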
Example #7
def read_ts(filepath, **kwargs):
    """Read a ts file into Functional Data.

    Build a DenseFunctionalData or IrregularFunctionalData object upon a ts
    file passed as parameter.

    Notes
    -----
    The data are assumed to be unidimensional; this is not checked.

    Parameters
    ----------
    filepath: str
        Any valid string path is acceptable.
    **kwargs:
        Keyword arguments passed to the load_from_tsfile_to_dataframe
        function.

    Returns
    -------
    obj: DenseFunctionalData or IrregularFunctionalData
        The data from the ts file as functional data.
    labels: np.ndarray
        The labels associated with each curve.

    """
    data, labels = load_from_tsfile_to_dataframe(filepath, **kwargs)

    # Curves sharing a single common length are dense, otherwise irregular.
    len_argvals = data.applymap(len)['dim_0'].unique()

    if len(len_argvals) == 1:
        obj = read_ts_dense(data)
    else:
        obj = read_ts_irregular(data)
    return obj, labels
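The dispatch hinges on whether every curve in dim_0 has the same number of points; a hedged usage sketch (the file path is illustrative):

obj, labels = read_ts("datasets/GunPoint/GunPoint_TRAIN.ts")

# equal-length curves yield DenseFunctionalData,
# ragged curves yield IrregularFunctionalData
print(type(obj).__name__, labels.shape)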
Example #8
def test_load_from_tsfile_to_dataframe():
    """Test the load_from_tsfile_to_dataframe() function."""

    # Test that an empty file is classed as invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ""

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with an incomplete set of metadata is invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata but no data is invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = (
                "@problemName Test Problem\n@timeStamps "
                "true\n@univariate true\n@classLabel false\n@data")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and no data but
    # invalid metadata values is invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName\n@timeStamps\n@univariate "
                             "true\n@classLabel false\n@data")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and a single
    # case/dimension parses correctly

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2)"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 1)
            np.testing.assert_equal(len(df.columns), 1)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 1)

            series = df["dim_0"][0]
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and 2 cases with 3
    # dimensions parses correctly

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
            file_contents += "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)     \n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 2)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 3.0)
            np.testing.assert_equal(series[1], 4.0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 15.0)
            np.testing.assert_equal(series[1], 16.0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and time-series of
    # different length parses correctly

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3):(0, 5), (1, 6)\n"
            file_contents += "(0, 11), (1, 12):(0, 13), (1,14):(0, 15)\n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 2)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 1)
            np.testing.assert_equal(series[0], 3.0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 1)
            np.testing.assert_equal(series[0], 15.0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data but an
    # inconsistent number of dimensions across cases is classed as invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
            file_contents += "(0, 11), (1, 12):(0, 13), (1,14)    \n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data but missing
    # values after a tuple is classed as invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5),\n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data and some
    # empty dimensions is classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "(0, 1), (1, 2):     :(0, 5), (1, 6)\n"
            file_contents += "(0, 11), (1, 12):(0, 13), (1,14)    :       \n"
            file_contents += (
                "(0, 21), (1, 22):(0, 23), (1,24)    :   (0,25), (1, 26)    \n"
            )

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 3)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_0"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 21.0)
            np.testing.assert_equal(series[1], 22.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_1"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 23.0)
            np.testing.assert_equal(series[1], 24.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_2"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 25.0)
            np.testing.assert_equal(series[1], 26.0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data that
    # contains datetimes as timestamps and has some empty dimensions is
    # classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += ("(01/01/2019 00:00:00, 1),  (01/02/2019 "
                              "00:00:00, 2)  :                               "
                              "                      : (01/05/2019 00:00:00, "
                              "5), (01/06/2019 00:00:00, 6)\n")
            file_contents += ("(01/01/2020 00:00:00, 11), (01/02/2020 "
                              "00:00:00, 12) : (01/03/2020 00:00:00, 13), "
                              "(01/04/2020 00:00:00, 14) :  \n")
            file_contents += ("(01/01/2021 00:00:00, 21), (01/02/2021 "
                              "00:00:00, 22) : (01/03/2021 00:00:00, 23), "
                              "(01/04/2021 00:00:00, 24) :  \n")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 3)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/01/2019"], 1.0)
            np.testing.assert_equal(series["01/02/2019"], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/01/2020"], 11.0)
            np.testing.assert_equal(series["01/02/2020"], 12.0)

            series = df["dim_0"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/01/2021"], 21.0)
            np.testing.assert_equal(series["01/02/2021"], 22.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/03/2020"], 13.0)
            np.testing.assert_equal(series["01/04/2020"], 14.0)

            series = df["dim_1"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/03/2021"], 23.0)
            np.testing.assert_equal(series["01/04/2021"], 24.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series["01/05/2019"], 5.0)
            np.testing.assert_equal(series["01/06/2019"], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_2"][2]
            np.testing.assert_equal(len(series), 0)

    finally:
        os.remove(path)

    # Test that a file that mixes timestamp conventions is invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += ("(01/01/2019 00:00:00, 1),  (01/02/2019 "
                              "00:00:00, 2)  :                               "
                              "                      : (01/05/2019 00:00:00, "
                              "5), (01/06/2019 00:00:00, 6)\n")
            file_contents += ("(00, 11), (1, 12) : (01/03/2020 00:00:00, 13), "
                              "(01/04/2020 00:00:00, 14) :  \n")
            file_contents += ("(01/01/2021 00:00:00, 21), (01/02/2021 "
                              "00:00:00, 22) : (01/03/2021 00:00:00, 23), "
                              "(01/04/2021 00:00:00, 24) :  \n")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data but missing
    # classes is classed as invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel true 0 1 "
                             "2\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
            file_contents += "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)     \n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data but invalid
    # classes is classed as invalid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel true 0 1 "
                             "2\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6) : 0 \n"
            file_contents += (
                "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)   : 3  \n")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file and assert that it is invalid

            np.testing.assert_raises(TsFileParseException,
                                     load_from_tsfile_to_dataframe, path)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data with classes
    # is classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "true\n@univariate true\n@classLabel true 0 1 "
                             "2\n@data\n")
            file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6): 0\n"
            file_contents += (
                "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16): 2     \n")

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df, y = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame of X values returned accurately reflects
            # the data in the file

            np.testing.assert_equal(len(df), 2)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 3.0)
            np.testing.assert_equal(series[1], 4.0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 2)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 15.0)
            np.testing.assert_equal(series[1], 16.0)

            # Test that the class values are as expected

            np.testing.assert_equal(len(y), 2)
            np.testing.assert_equal(y[0], "0")
            np.testing.assert_equal(y[1], "2")

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data, with no
    # timestamps, is classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "false\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "1,2:3,4:5,6\n"
            file_contents += "11,12:13,14:15,16\n"
            file_contents += "21,22:23,24:25,26\n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 3)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_0"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 21.0)
            np.testing.assert_equal(series[1], 22.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 3.0)
            np.testing.assert_equal(series[1], 4.0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_1"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 23.0)
            np.testing.assert_equal(series[1], 24.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 15.0)
            np.testing.assert_equal(series[1], 16.0)

            series = df["dim_2"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 25.0)
            np.testing.assert_equal(series[1], 26.0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data, with no
    # timestamps and some empty dimensions, is classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:

            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "false\n@univariate true\n@classLabel "
                             "false\n@data\n")
            file_contents += "1,2::5,6\n"
            file_contents += "11,12:13,14:15,16\n"
            file_contents += "21,22:23,24:\n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame returned accurately reflects the data in
            # the file

            np.testing.assert_equal(len(df), 3)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_0"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 21.0)
            np.testing.assert_equal(series[1], 22.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_1"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 23.0)
            np.testing.assert_equal(series[1], 24.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 15.0)
            np.testing.assert_equal(series[1], 16.0)

            series = df["dim_2"][2]
            np.testing.assert_equal(len(series), 0)

    finally:
        os.remove(path)

    # Test that a file with a complete set of metadata and data, with no
    # timestamps and some empty dimensions and classes, is classed as valid

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:
            # Write the contents of the file

            file_contents = ("@problemName Test Problem\n@timeStamps "
                             "false\n@univariate true\n@classLabel true cat "
                             "bear dog\n@data\n")
            file_contents += "1,2::5,6:cat  \n"
            file_contents += "11,12:13,14:15,16:  dog\n"
            file_contents += "21,22:23,24::   bear   \n"

            tmp_file.write(file_contents)
            tmp_file.flush()

            # Parse the file

            df, y = load_from_tsfile_to_dataframe(path)

            # Test the DataFrame of X values returned accurately reflects
            # the data in the file

            np.testing.assert_equal(len(df), 3)
            np.testing.assert_equal(len(df.columns), 3)

            series = df["dim_0"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_0"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 1.0)
            np.testing.assert_equal(series[1], 2.0)

            series = df["dim_0"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 11.0)
            np.testing.assert_equal(series[1], 12.0)

            series = df["dim_0"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 21.0)
            np.testing.assert_equal(series[1], 22.0)

            series = df["dim_1"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_1"][0]
            np.testing.assert_equal(len(series), 0)

            series = df["dim_1"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 13.0)
            np.testing.assert_equal(series[1], 14.0)

            series = df["dim_1"][2]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 23.0)
            np.testing.assert_equal(series[1], 24.0)

            series = df["dim_2"]
            np.testing.assert_equal(len(series), 3)

            series = df["dim_2"][0]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 5.0)
            np.testing.assert_equal(series[1], 6.0)

            series = df["dim_2"][1]
            np.testing.assert_equal(len(series), 2)
            np.testing.assert_equal(series[0], 15.0)
            np.testing.assert_equal(series[1], 16.0)

            series = df["dim_2"][2]
            np.testing.assert_equal(len(series), 0)

            # Test that the class values are as expected

            np.testing.assert_equal(len(y), 3)
            np.testing.assert_equal(y[0], "cat")
            np.testing.assert_equal(y[1], "dog")
            np.testing.assert_equal(y[2], "bear")

    finally:
        os.remove(path)
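Pulling the fixtures above together, a minimal valid .ts file with timestamps and class labels looks like this (dimensions are separated by colons, (timestamp, value) pairs by commas, and the class value comes last; content reconstructed from the test fixtures):

@problemName Test Problem
@timeStamps true
@univariate true
@classLabel true 0 1 2
@data
(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6): 0
(0, 11), (1, 12):(0, 13), (1, 14):(0, 15), (1, 16): 2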
Example #9
def loadDataset(dataset):
    # * Data loads
    # dataset = "handwritting"
    if dataset == "motions":
        X, y = load_basic_motions(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

        x_train = getValues(X_train)
        x_train = x_train.transpose([0,2,1])
        x_test = getValues(X_test)
        x_test = x_test.transpose([0,2,1])
        N, D = X_train.shape
        T = X_train.to_numpy()[0][0].to_numpy().shape[0]
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
        variables = ["accelerometer-x", "accelerometer-y", "accelerometer-z", "gyroscope-x", "gyroscope-y", "gyroscope-z"]
    else:
        if dataset == "wafer":
            X_train, y_train = load_from_tsfile_to_dataframe('datasets/Wafer/Wafer_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe('datasets/Wafer/Wafer_TRAIN.ts')
            variables = ["sensor"]
        elif dataset == "libras":
            X_train, y_train = load_from_tsfile_to_dataframe('datasets/Libras/Libras_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe('datasets/Libras/Libras_TEST.ts')
            variables = ["x", "y"]
        elif dataset == "uwave":
            X_train, y_train = load_from_tsfile_to_dataframe('datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe('datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TEST.ts')
            X_test = X_test[:1000]
            y_test = y_test[:1000]
            variables = ["accelerometer"]
        elif dataset == "stand":
            X_train, y_train = load_from_tsfile_to_dataframe('datasets/StandWalkJump/StandWalkJump_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe('datasets/StandWalkJump/StandWalkJump_TEST.ts')
            variables = ["ECG-1", "ECG-2", "ECG-3", "ECG-4"]
        elif dataset == "handwritting":
            X_train, y_train = load_from_tsfile_to_dataframe('datasets/Handwriting/Handwriting_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe('datasets/Handwriting/Handwriting_TEST.ts')
            variables = ["accelerometer-x", "accelerometer-y", "accelerometer-z"]

        N, D = X_train.shape
        N_te = X_test.shape[0]
        T = np.array(X_train.to_numpy()[0][0]).shape[0]
        x_train = np.zeros([N, D, T])
        x_test = np.zeros([N_te, D, T])
        for i in range(N):
            for j in range(D):
                x_train[i][j] = np.array(X_train.to_numpy()[i][j])
        
        for i in range(N_te):
            for j in range(D):
                x_test[i][j] = np.array(X_test.to_numpy()[i][j])
        # print(x_train.shape)
        x_train = x_train.transpose([0, 2, 1])
        x_test = x_test.transpose([0, 2, 1])
        
        # print(y_train)
        # x_test, y_test = load_from_tsfile_to_dataframe('datasets/Wafer/Wafer_TEST.ts')


    # * scale data

    x_train = x_train.transpose([2,0,1])
    x_test = x_test.transpose([2,0,1])
    for i in range(D):
        x_train[i], scaler = scaleSerie(x_train[i])
        x_test[i], _ = scaleSerie(x_test[i], scaler)
    x_train = x_train.transpose([1,2,0])
    x_test = x_test.transpose([1,2,0])

    # * get data labels
    labels = np.unique(y_train)
    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    y_train = lb.transform(y_train)
    y_test = lb.transform(y_test)
    if len(labels) == 2:
        y_train = np.hstack((y_train, 1 - y_train))
        y_test = np.hstack((y_test, 1 - y_test))
    y_train_int = [labelInt(label) for label in y_train]
    y_test_int = [labelInt(label) for label in y_test]



    X = np.concatenate([x_train, x_test])
    y = np.concatenate([y_train_int, y_test_int])
    y = np.expand_dims(y, axis=1)

    return X, y
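A hedged usage sketch of the loader above; the output shapes follow from the final transposes (instances, time steps, variables):

X, y = loadDataset("libras")
print(X.shape)  # (N_train + N_test, T, D)
print(y.shape)  # (N_total, 1), integer class ids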
Example #10
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=42)

        x_train = getValues(X_train)
        x_train = x_train.transpose([0, 2, 1])
        x_test = getValues(X_test)
        x_test = x_test.transpose([0, 2, 1])
        N, D = X_train.shape
        T = X_train.to_numpy()[0][0].to_numpy().shape[0]
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
        N_te = X_test.shape[0]
    else:
        if args["dataset"] == "wafer":
            X_train, y_train = load_from_tsfile_to_dataframe(
                'datasets/Wafer/Wafer_TRAIN.ts')
            # NOTE: reloads the TRAIN file; Wafer_TEST.ts is presumably intended
            X_test, y_test = load_from_tsfile_to_dataframe(
                'datasets/Wafer/Wafer_TRAIN.ts')
        elif args["dataset"] == "libras":
            X_train, y_train = load_from_tsfile_to_dataframe(
                'datasets/Libras/Libras_TRAIN.ts')
            X_test, y_test = load_from_tsfile_to_dataframe(
                'datasets/Libras/Libras_TEST.ts')
        elif args["dataset"] == "uwave":
            X_train, y_train = load_from_tsfile_to_dataframe(
                'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TRAIN.ts'
            )
            X_test, y_test = load_from_tsfile_to_dataframe(
                'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TEST.ts'
            )
        elif args["dataset"] == "stand":
Example #11
    def _process_data(self):
        root = self.root
        data_loc = (root / "UEA" / "Multivariate_ts" /
                    "CharacterTrajectories" / "CharacterTrajectories")

        train_X, train_y = load_from_tsfile_to_dataframe(
            str(data_loc) + "_TRAIN.ts")
        test_X, test_y = load_from_tsfile_to_dataframe(
            str(data_loc) + "_TEST.ts")
        train_X = train_X.to_numpy()
        test_X = test_X.to_numpy()
        X = np.concatenate((train_X, test_X), axis=0)
        y = np.concatenate((train_y, test_y), axis=0)

        lengths = torch.tensor([len(Xi[0]) for Xi in X])
        maxlen = lengths.max()

        # Each channel is a pandas Series whose length matches that of the
        # time series.
        X = torch.stack(
            [
                torch.stack([pad(channel, maxlen) for channel in batch], dim=0)
                for batch in X
            ],
            dim=0,
        )

        # Now fix the labels to be integers from 0 upwards
        targets = co.OrderedDict()
        counter = 0
        for yi in y:
            if yi not in targets:
                targets[yi] = counter
                counter += 1
        y = torch.tensor([targets[yi] for yi in y])

        # If dropped_rate is nonzero, randomly drop that percentage of time
        # points from each series.
        if self.dropped_rate != 0:
            generator = torch.Generator().manual_seed(56789)
            X_removed = []
            for Xi in X:
                removed_points = (torch.randperm(
                    X.shape[-1],
                    generator=generator)[:int(X.shape[-1] *
                                              float(self.dropped_rate) /
                                              100.0)].sort().values)
                Xi_removed = Xi.clone()
                Xi_removed[:, removed_points] = float("nan")
                X_removed.append(Xi_removed)
            X = torch.stack(X_removed, dim=0)

        # Normalize data
        X = normalise_data(X, y)

        # Once the data are normalized, append times and mask values if required.
        if self.dropped_rate != 0:
            # Get a mask of positions that were dropped (only the first channel
            # is required, as all channels are dropped synchronously).
            mask_exists = (~torch.isnan(X[:, :1, :])).float()
            X = torch.where(~torch.isnan(X), X, torch.Tensor([0.0]))
            X = torch.cat([X, mask_exists], dim=1)

        train_X, val_X, test_X = split_data(X, y)
        train_y, val_y, test_y = split_data(y, y)

        return (
            train_X,
            val_X,
            test_X,
            train_y,
            val_y,
            test_y,
        )
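pad is not defined in this snippet; a minimal sketch of what it is assumed to do, as a hypothetical helper that right-pads each channel Series to maxlen with NaN:

import torch

def pad(channel, maxlen):
    # Hypothetical helper: right-pad a 1-D pandas Series to maxlen with NaN.
    out = torch.full((int(maxlen),), float("nan"))
    out[: len(channel)] = torch.tensor(channel.to_numpy(), dtype=torch.float)
    return out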
Example #12
def create_subsample(input_dir, UCR_list, output_dir):

    for db_name_ite in UCR_list.values:

        db_name = db_name_ite[0]

        train_x, train_y = load_from_tsfile_to_dataframe(
            "%s/%s/%s_TRAIN.ts" % (input_dir, db_name, db_name))
        test_x, test_y = load_from_tsfile_to_dataframe(
            "%s/%s/%s_TEST.ts" % (input_dir, db_name, db_name))
        data = np.zeros((len(train_y) + len(test_y), len(train_x.iloc[1, 0])))

        for i in range(0, len(train_y)):
            data[i, :] = train_x.iloc[i, :][0]

        k = 0
        for i in range(len(train_y), len(train_y) + len(test_y)):
            data[i, :] = test_x.iloc[k, :][0]
            k = k + 1

        classes = np.concatenate((train_y, test_y))
        classes = classes.astype(int)

        n = data.shape[0]

        if n < 100:
            subratio = 0.8
        elif n < 300:
            subratio = 0.6
        elif n < 800:
            subratio = 0.4
        elif n < 1500:
            subratio = 0.2
        elif n < 5000:
            subratio = 0.1
        else:
            subratio = 0.05

        # Keep at least ~10 instances per class in the subsample.
        while n * subratio / len(np.unique(classes)) < 10:
            subratio = subratio + 0.1

        if subratio > 0.8:
            subratio = 0.8

        s = StratifiedShuffleSplit(test_size=subratio / 2,
                                   train_size=subratio / 2)
        train_index, test_index = next(s.split(data, classes))

        data_df = np.concatenate((data[train_index, :], data[test_index, :]))
        classes_df = np.concatenate(
            (classes[train_index], classes[test_index]))
        data_df = np.column_stack((data_df, classes_df))

        df = pd.DataFrame(data_df)
        attributes = [(c.astype(str), 'NUMERIC')
                      for c in df.columns.values[:-1]]
        t = df.columns[-1]
        attributes += [('target', df[t].unique().astype(str).tolist())]

        data = [
            df.loc[i].values[:-1].tolist() + [df[t].loc[i]]
            for i in range(df.shape[0])
        ]

        arff_dic = {
            'attributes': attributes,
            'data': data,
            'relation': db_name,
            'description': ''
        }
        if not os.path.exists("%s/%s" % (output_dir, db_name)):
            os.makedirs("%s/%s" % (output_dir, db_name))
        with open("%s/%s/%s.arff" % (output_dir, db_name, db_name),
                  "w",
                  encoding="utf8") as f:
            arff.dump(arff_dic, f)
        print("%s created" % db_name)

    print("Subsample finished!")

    return
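The subsampling ratio shrinks with dataset size, is bumped until each class keeps roughly ten instances, and is finally capped at 0.8; a worked example:

# n = 120 instances, 10 classes
#   n < 300                      -> subratio = 0.6
#   120 * 0.6 / 10 = 7.2  < 10   -> 0.7
#   120 * 0.7 / 10 = 8.4  < 10   -> 0.8
#   120 * 0.8 / 10 = 9.6  < 10   -> 0.9
#   120 * 0.9 / 10 = 10.8 >= 10  -> stop; then clamped to the 0.8 cap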
Example #13
nbDatasets = len(os.listdir(DATA_PATH))
print("Evaluating on %s cores %d classifiers on %d datasets with a %d fold cross-validation..." % 
     ("all" if nb_jobs == -1 else str(nb_jobs), len(classifiers), nbDatasets, nb_split))

start_global_time = time.perf_counter()

# Evaluates all classifiers using cross validation per dataset
for dataset in os.listdir(DATA_PATH):
    # Loads dataset for cross validation
    print("\nLoading %s dataset..." % dataset)
    start_load_time = time.perf_counter()

    filepath = dataset+"/"+dataset+"_" # dataset/dataset_TEST.ts and dataset/dataset_TRAIN.ts
    # Load train data + class
    d, c = load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, filepath+"TRAIN.ts"))
    # Load test data + class
    dd, cc = load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, filepath+"TEST.ts"))

    # Store all data and all classes (concatenate train and test);
    # DataFrame.append was removed in pandas 2.0, so use pd.concat
    data, classes = pd.concat([d, dd]), np.concatenate((c, cc))

    elapsed_load_time = time.perf_counter() - start_load_time
    print("Loading took: %f seconds" % elapsed_load_time)

    # Now we will do all cross-validations on this dataset
    for classifier, classifier_name in classifiers:
        print("Classifier: "+classifier_name)
        start_time = time.perf_counter()
        # cross-validation
        scores = cross_val_score(classifier, data, classes, cv=cv, n_jobs=nb_jobs)