Example #1
    def test_defaults_produce_clusters(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # mark grouping key
        dataframe.metadata = dataframe.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )

        # create the clustering primitive and cluster
        hyperparams_class = Sloth.metadata.query()["primitive_code"][
            "class_type_arguments"
        ]["Hyperparams"]
        sloth = Sloth(hyperparams=hyperparams_class.defaults())
        result = sloth.produce_clusters(inputs=dataframe).value

        # check that the grouping key columns match
        self.assertListEqual(
            result.iloc[:, 0].tolist(), ["alpha", "bravo", "charlie", "delta"]
        )

        # check that the first two keys are each in their own cluster, and the last two are in
        # the same cluster
        clusters_by_key = result[["key", "__cluster"]].drop_duplicates()
        self.assertNotEqual(clusters_by_key.iloc[0, 1], clusters_by_key.iloc[1, 1])
        self.assertNotEqual(clusters_by_key.iloc[1, 1], clusters_by_key.iloc[2, 1])
        self.assertEqual(clusters_by_key.iloc[2, 1], clusters_by_key.iloc[3, 1])

        # check metadata is correct
        column_metadata = result.metadata.query_column(0)
        self.assertEqual(column_metadata["structural_type"], str)
        column_metadata = result.metadata.query_column(1)
        self.assertEqual(column_metadata["structural_type"], np.int64)
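
The membership checks above lean on a small pandas idiom: drop_duplicates over the (key, cluster) pairs reduces the frame to one row per grouping key, which can then be compared positionally. A self-contained sketch of the same check on toy data (the column names mirror the test; the values are made up):

import pandas as pd

# toy stand-in for the primitive's output: a grouping key plus an assigned cluster id
result = pd.DataFrame(
    {
        "key": ["alpha", "alpha", "bravo", "charlie", "delta"],
        "__cluster": [0, 0, 1, 2, 2],
    }
)

# one row per key -> its cluster assignment
clusters_by_key = result[["key", "__cluster"]].drop_duplicates()

# alpha and bravo land in different clusters; charlie and delta share one
assert clusters_by_key.iloc[0, 1] != clusters_by_key.iloc[1, 1]
assert clusters_by_key.iloc[2, 1] == clusters_by_key.iloc[3, 1]
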
Example #2
    def test_vector_parse_twice(self) -> None:
        dataset = test_utils.load_dataset(self._image_dataset_path)
        df = test_utils.get_dataframe(dataset, "learningData")

        hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
        cpp = ColumnParserPrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {
                    "parsing_semantics": [
                        "https://metadata.datadrivendiscovery.org/types/FloatVector",
                    ]
                }
            )
        )
        target_coords = [
            20.999598,
            63.488694,
            20.999598,
            63.499462,
            21.023702,
            63.499462,
            21.023702,
            63.488694,
        ]
        result_df = cpp.produce(inputs=df).value
        result_coords = result_df["coordinates"][0]
        self.assertEqual(len(result_coords), len(target_coords))
        for a, b in zip(target_coords, result_coords):
            self.assertAlmostEqual(a, b, 5)

        result_2_df = cpp.produce(inputs=result_df).value
        result_2_coords = result_2_df["coordinates"][0]
        self.assertEqual(len(result_2_coords), len(target_coords))
        for a, b in zip(target_coords, result_2_coords):
            self.assertAlmostEqual(a, b, 5)
Example #3
def _test_set_training_data(dataset_name,
                            target_col,
                            group_compose=False,
                            split_train=False):
    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TRAIN/dataset_TRAIN')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)
    time_col = df.metadata.list_columns_with_semantic_types((
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/DateTime",
    ))[0]
    original_times = df.iloc[:, time_col]
    df.iloc[:, time_col] = pd.to_datetime(
        df.iloc[:, time_col], format=datetime_format_strs[dataset_name])
    df = df.sort_values(by=df.columns[time_col])
    df.iloc[:, time_col] = original_times
    train_split = int(0.9 * df.shape[0])
    train = df.iloc[:train_split, :].reset_index(drop=True)
    val = df.iloc[train_split:, :].reset_index(drop=True)
    df = df.reset_index(drop=True)

    preprocess = PreProcessPipeline(group_compose=group_compose)
    preprocess.fit(train)
    train_inputs, train_outputs = preprocess.produce(train)
    val_inputs, _ = preprocess.produce(val)
    all_inputs, all_outputs = preprocess.produce(df)
    deepar_hp = DeepArPrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']

    pred_length_idx = 1 if split_train else 0
    deepar = DeepArPrimitive(hyperparams=deepar_hp(
        deepar_hp.defaults(),
        epochs=1,
        steps_per_epoch=1,
        number_samples=10,
        prediction_length=min_pred_lengths[dataset_name][pred_length_idx] + 5,
        context_length=min_pred_lengths[dataset_name][pred_length_idx] - 5,
        quantiles=(0.1, 0.9),
        output_mean=False))
    if split_train:
        deepar.set_training_data(inputs=train_inputs, outputs=train_outputs)
    else:
        deepar.set_training_data(inputs=all_inputs, outputs=all_outputs)

    if group_compose:
        assert deepar._grouping_columns == [train_inputs.shape[1] - 1]
    else:
        assert grouping_cols[dataset_name] == deepar._grouping_columns
    assert freqs[dataset_name] == deepar._freq
    assert real_cols[dataset_name] == deepar._real_columns
    assert isinstance(deepar._deepar_dataset.get_distribution_type(),
                      distr[dataset_name])
    deepar.fit()
    return deepar, preprocess, train_inputs, val_inputs, all_inputs
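
One detail above is easy to miss: the time column is parsed to datetimes only long enough to sort the frame chronologically, and the raw strings are then restored before the 90/10 split (column assignment aligns on the index, so each row gets its own original value back). A runnable sketch of the same maneuver on toy data (column name and date format are made up):

import pandas as pd

df = pd.DataFrame(
    {"time": ["02-01-2020", "01-01-2020", "03-01-2020"], "value": [2, 1, 3]}
)

# parse to datetime only to obtain a chronological ordering
original_times = df["time"]
df["time"] = pd.to_datetime(df["time"], format="%d-%m-%Y")
df = df.sort_values(by="time")
df["time"] = original_times  # index-aligned: each row regains its raw string

# chronological 90/10 train/validation split
train_split = int(0.9 * df.shape[0])
train = df.iloc[:train_split, :].reset_index(drop=True)
val = df.iloc[train_split:, :].reset_index(drop=True)
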
Example #4
def _test_set_training_data(dataset_name,
                            target_col,
                            group_compose=False,
                            split_train=False):
    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TRAIN/dataset_TRAIN')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)
    time_col = df.metadata.list_columns_with_semantic_types((
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/DateTime",
    ))[0]
    original_times = df.iloc[:, time_col]
    df.iloc[:, time_col] = pd.to_datetime(
        df.iloc[:, time_col], format=datetime_format_strs[dataset_name])
    df = df.sort_values(by=df.columns[time_col])
    df.iloc[:, time_col] = original_times
    train_split = int(0.9 * df.shape[0])
    train = df.iloc[:train_split, :].reset_index(drop=True)
    val = df.iloc[train_split:, :].reset_index(drop=True)
    df = df.reset_index(drop=True)

    preprocess = PreProcessPipeline(group_compose=group_compose)
    preprocess.fit(train)
    train_inputs, train_outputs = preprocess.produce(train)
    val_inputs, _ = preprocess.produce(val)
    all_inputs, all_outputs = preprocess.produce(df)
    nbeats_hp = NBEATSPrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']

    pred_length_idx = 1 if split_train else 0
    nbeats = NBEATSPrimitive(hyperparams=nbeats_hp(
        nbeats_hp.defaults(),
        epochs=1,
        steps_per_epoch=1,
        num_estimators=1,
        prediction_length=min_pred_lengths[dataset_name][pred_length_idx] + 5,
        #quantiles = (0.1, 0.9),
    ))
    if os.path.isdir(nbeats.hyperparams['weights_dir']):
        shutil.rmtree(nbeats.hyperparams['weights_dir'])

    if split_train:
        nbeats.set_training_data(inputs=train_inputs, outputs=train_outputs)
    else:
        nbeats.set_training_data(inputs=all_inputs, outputs=all_outputs)

    if group_compose:
        assert nbeats._grouping_columns == [train_inputs.shape[1] - 1]
    else:
        assert grouping_cols[dataset_name] == nbeats._grouping_columns
    assert freqs[dataset_name] == nbeats._freq
    nbeats.fit()
    return nbeats, preprocess, train_inputs, val_inputs, all_inputs
Example #5
    def test_produce_no_fit(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe.drop(columns=["delta", "echo"], inplace=True)

        hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        hyperparams = hyperparams_class.defaults()

        ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)
        ranked_lsvc.set_training_data(
            inputs=dataframe[["alpha", "bravo"]],
            outputs=pd.DataFrame({"charlie":
                                  dataframe["charlie"].astype(int)}),
        )
        results = ranked_lsvc.produce(
            inputs=dataframe[["alpha", "bravo"]]).value
        expected_labels = [1, 1, 1, 0, 0, 0, 0, 0, 0]
        expected_rank = [8, 8, 8, 5, 5, 5, 2, 2, 2]
        expected_confidence = [
            0.729,
            0.729,
            0.729,
            0.268,
            0.268,
            0.268,
            0.051,
            0.051,
            0.051,
        ]
        self.assertListEqual(list(results["charlie"]), expected_labels)
        self.assertListEqual(list(results["rank"]), expected_rank)
        np.testing.assert_almost_equal(list(results["confidence"]),
                                       expected_confidence,
                                       decimal=3)

        self.assertListEqual(
            results.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/Score", )),
            [1],
        )
        self.assertListEqual(
            results.metadata.list_columns_with_semantic_types((
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )),
            [0, 1, 2],
        )
        self.assertListEqual(
            results.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/Rank", )),
            [2],
        )
Example #6
def generate_dataset(inputFile, outputFile, n):
    """
    Generates a random tab-delimited .txt file with the same column names as the input file and
    values drawn at random from the possible values of each column in the input file.

    :param inputFile: path of the file providing column names and candidate values
    :param outputFile: name of the output file, written under the Data/ directory
    :param n: number of rows in the output file
    """
    column_name_values = get_dataframe(inputFile)

    data_set = generate_randomDF(column_name_values, n)
    data_set.to_csv("Data/" + outputFile, sep="\t", index_label="foundid")
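
A hypothetical invocation, following the path conventions used elsewhere in these examples (the output file name is made up, and the Data/ directory must already exist):

# sample 100 random rows from the columns and candidate values in the input file
generate_dataset("Constants/ColumnsAndValuesData.txt", "random_found.txt", 100)
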
Example #7
    def test_defaults(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the singleton replacer
        hyperparams_class = ReplaceSingletonsPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        replacer = ReplaceSingletonsPrimitive(
            hyperparams=hyperparams_class.defaults())

        result = replacer.produce(inputs=dataframe).value
        self.assertEqual(result["alpha"].iloc[4], utils.SINGLETON_INDICATOR)
Example #8
def _test_ts(dataset_name, target_col, group_compose=False, split_train=False):
    nbeats, preprocess, inputs_train, inputs_val, inputs_all = _test_set_training_data(
        dataset_name, target_col, group_compose=group_compose, split_train=split_train
    )
    # _test_produce_train_data(nbeats, inputs_train, inputs_val, inputs_all)

    dataset = test_utils.load_dataset(
        f"/datasets/seed_datasets_current/{dataset_name}/TEST/dataset_TEST/"
    )
    df = test_utils.get_dataframe(dataset, "learningData", target_col)
    inputs_test, _ = preprocess.produce(df)

    _test_produce_test_data(nbeats, inputs_test)
Example #9
    def test_no_missing(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the imputer
        hyperparams_class = CategoricalImputerPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        imputer = CategoricalImputerPrimitive(
            hyperparams=hyperparams_class.defaults().replace({
                "strategy": "most_frequent",
                "use_columns": [3]
            }))

        result = imputer.produce(inputs=dataframe).value
        self.assertEqual(result["charlie"].iloc[2], "whiskey")
Example #10
    def test_defaults(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe = ListEncoderPrimitiveTestCase._convert_lists(dataframe)

        # create the encoder
        hyperparams_class = ListEncoderPrimitive.metadata.query()["primitive_code"][
            "class_type_arguments"
        ]["Hyperparams"]
        encoder = ListEncoderPrimitive(hyperparams=hyperparams_class.defaults())

        encoder.set_training_data(inputs=dataframe)
        encoder.fit()
        result = encoder.produce(inputs=dataframe).value
        self._assert_result(result)
Example #11
def _test_ts(dataset_name, target_col, group_compose=False, split_train=False):
    deepar, preprocess, inputs_train, inputs_val, inputs_all = _test_set_training_data(
        dataset_name,
        target_col,
        group_compose=group_compose,
        split_train=split_train)
    _test_produce_train_data(deepar, inputs_train, inputs_val, inputs_all)

    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TEST/dataset_TEST/')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)
    inputs_test, _ = preprocess.produce(df)

    _test_produce_test_data(deepar, inputs_test)
    _test_produce_confidence_intervals(deepar, inputs_all)
    _test_produce_confidence_intervals(deepar, inputs_test)
Example #12
    def test_basic(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe.drop(columns=["delta", "echo"], inplace=True)

        hyperparams_class = IsolationForestPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        hyperparams = hyperparams_class.defaults().replace({"n_jobs": -1})

        isp = IsolationForestPrimitive(hyperparams=hyperparams)
        isp.set_training_data(inputs=dataframe[["alpha", "bravo"]])
        isp.fit()
        results = isp.produce(inputs=dataframe[["alpha", "bravo"]]).value

        self.assertListEqual(list(results["outlier_label"]),
                             [-1, -1, -1, -1, -1, -1, -1, -1, -1])
Example #13
    def test_band_mapping_replace(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        dataset.metadata = dataset.metadata.add_semantic_type(
            ("learningData", metadata_base.ALL_ELEMENTS, 2),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )
        dataset.metadata = dataset.metadata.add_semantic_type(
            ("learningData", metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/FileName",
        )
        dataset.metadata = dataset.metadata.add_semantic_type(
            ("learningData", metadata_base.ALL_ELEMENTS, 5),
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        )
        dataset.metadata = dataset.metadata.update(
            ("0", ), {"location_base_uris": self._media_path})
        dataset.metadata = dataset.metadata.update(
            ("learningData", metadata_base.ALL_ELEMENTS, 1),
            {"location_base_uris": [self._media_path]},
        )
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        hyperparams_class = DataFrameSatelliteImageLoaderPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        hyperparams = hyperparams_class.defaults().replace({
            "return_result": "replace",
            "n_jobs": -1
        })
        loader = DataFrameSatelliteImageLoaderPrimitive(
            hyperparams=hyperparams)
        result_dataframe = loader.produce(inputs=dataframe).value

        # verify the output
        self.assertListEqual(list(result_dataframe.shape), [2, 7])
        self.assertListEqual(list(result_dataframe["image_file"][0].shape),
                             [12, 120, 120])
        self.assertEqual(result_dataframe["d3mIndex"][0], "1")
        self.assertEqual(result_dataframe["group_id"][0],
                         "S2A_MSIL2A_20170613T101031_0_49")
        self.assertEqual(result_dataframe["d3mIndex"][1], "2")
        self.assertEqual(result_dataframe["group_id"][1], "2")
        self.assertEqual(
            result_dataframe.metadata.list_columns_with_semantic_types((
                "https://metadata.datadrivendiscovery.org/types/LocationPolygon",
            )),
            [5],
        )
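
The metadata bookkeeping in this test follows a round trip that recurs throughout these examples: add_semantic_type tags a column, and list_columns_with_semantic_types later recovers the indices of the tagged columns. A minimal sketch against the d3m container types (assuming the d3m core package is available):

from d3m import container
from d3m.metadata import base as metadata_base

df = container.DataFrame(
    {"group_id": ["a", "b"], "value": [1.0, 2.0]}, generate_metadata=True
)

# tag column 0 as the grouping key
df.metadata = df.metadata.add_semantic_type(
    (metadata_base.ALL_ELEMENTS, 0),
    "https://metadata.datadrivendiscovery.org/types/GroupingKey",
)

# recover the indices of the tagged columns
assert df.metadata.list_columns_with_semantic_types(
    ("https://metadata.datadrivendiscovery.org/types/GroupingKey",)
) == [0]
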
Example #14
    def test_defaults(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the imputer
        hyperparams_class = CategoricalImputerPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        imputer = CategoricalImputerPrimitive(
            hyperparams=hyperparams_class.defaults())

        result = imputer.produce(inputs=dataframe).value
        self.assertEqual(result["alpha"].iloc[2], "whiskey")
        self.assertEqual(result["bravo"].iloc[2], "whiskey")
        self.assertEqual(result["charlie"].iloc[2], "whiskey")
        self.assertEqual(result["delta"].iloc[2],
                         utils.MISSING_VALUE_INDICATOR)
Example #15
    def test_classification_singleton_label(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe = dataframe.iloc[0:6]

        # create the encoder
        hyperparams_class = TextEncoderPrimitive.metadata.query()["primitive_code"][
            "class_type_arguments"
        ]["Hyperparams"]
        encoder = TextEncoderPrimitive(hyperparams=hyperparams_class.defaults())
        encoder.set_training_data(
            inputs=dataframe.iloc[:, [0, 1]], outputs=dataframe[['bravo']]
        )

        # should fail in this case because we have a label with a cardinality of 1
        self.assertRaises(ValueError, encoder.fit)
Example #16
def load_entity_table(label):
    '''
    Description: Allows the user to analyze the individual words within each category of entity.
    Params: Selected category of entity from dropdown.
    Returns: HTML table of entities within the entity category.
    '''

    global interface_obj
    df = utils.get_dataframe(label, interface_obj.entity_dic)
    columns = df.columns
    return html.Table(
        [html.Tr([html.Th(label)])] +
        [html.Tr([html.Th(col) for col in columns])] + [
            html.Tr([html.Td(df.iloc[i][col]) for col in columns])
            for i in range(len(df))
        ],
        id='entity-table')
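
The nested comprehension above is a standard Dash recipe for rendering a DataFrame as an HTML table: a caption row, a header row, then one row per record. A self-contained version on toy data (written against the modern from dash import html style; the code above may rely on the older dash_html_components package):

import pandas as pd
from dash import html

def dataframe_to_table(df: pd.DataFrame, caption: str) -> html.Table:
    # caption row, header row, then one row per DataFrame record
    return html.Table(
        [html.Tr([html.Th(caption)])]
        + [html.Tr([html.Th(col) for col in df.columns])]
        + [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(len(df))
        ]
    )

table = dataframe_to_table(pd.DataFrame({"entity": ["NASA"], "count": [3]}), "ORG")
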
Example #17
    def test_basic(self) -> None:
        dataset = test_utils.load_dataset(self._tabular_dataset_path)
        df = test_utils.get_dataframe(dataset, "learningData")
        df.metadata = df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1), "http://schema.org/Integer"
        )
        df.metadata = df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 2), "http://schema.org/Float"
        )
        hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
        cpp = ColumnParserPrimitive(hyperparams=hyperparams_class.defaults())
        result_df = cpp.produce(inputs=df).value
        self.assertEqual(result_df["d3mIndex"].dtype, np.dtype("int64"))
        self.assertEqual(result_df["alpha"].dtype, np.dtype("int64"))
        self.assertEqual(result_df["bravo"].dtype, np.dtype("float64"))
        self.assertEqual(result_df["charlie"].dtype, np.dtype("int64"))
        self.assertEqual(result_df["delta"].dtype, np.dtype("object"))
        self.assertEqual(result_df["echo"].dtype, np.dtype("float64"))
Example #18
def get_model_cds_X_test(split, SI, param=Parameters.standard):
    """
    Days before the split date form the training set; days on and after it form the testing set.

    :param split: date separating the training and testing sets
    :param SI: identifier passed to get_dataframe to select the data series
    :param param: keyword arguments forwarded to prepare_model
    :return: the trained model, the CDS, and the test features
    """
    df = get_dataframe(SI)
    cds = CDS(df.index, df.CLOSE, df.TURN, SI)

    # Train from scratch; loading from and saving to disk are disabled below
    model, X_test, y_test = prepare_model(cds,
                                          split_date=split,
                                          load_from_disk=False,
                                          save_to_disk=False,
                                          evaluate=True,
                                          **param)

    return model, cds, X_test
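
A hypothetical call site (both arguments are made-up values; the split date's format and the meaning of SI depend on what get_dataframe and prepare_model expect):

# everything before the split date trains the model; the rest becomes X_test
model, cds, X_test = get_model_cds_X_test("2018-01-01", "000300")
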
Example #19
    def test_single_row(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the encoder
        hyperparams_class = BinaryEncoderPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        encoder = BinaryEncoderPrimitive(hyperparams=hyperparams_class.
                                         defaults().replace({"min_binary": 3}))

        encoder.set_training_data(inputs=dataframe)
        encoder.fit()
        result = encoder.produce(inputs=dataframe.head(1)).value
        self.assertEqual(len(result.index), 1)
        self.assertEqual(
            result.metadata.list_columns_with_semantic_types((
                "https://metadata.datadrivendiscovery.org/types/Attribute", )),
            [1, 2, 3, 4, 5, 6, 7],
        )
Example #20
    def test_normalized(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe.drop(columns=["delta", "echo"], inplace=True)

        hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        hyperparams = hyperparams_class.defaults().replace(
            {"scaling": "standardize"})

        ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)
        # this is here because CalibratedClassifierCV fails if the predicted labels do not
        # contain at least one instance of every possible label
        dataframe.loc[1, "charlie"] = 1.0
        dataframe.loc[8, "charlie"] = 1.0
        ranked_lsvc.set_training_data(
            inputs=dataframe[["alpha", "bravo"]],
            outputs=pd.DataFrame({"charlie":
                                  dataframe["charlie"].astype(int)}),
        )
        ranked_lsvc.fit()
        results = ranked_lsvc.produce(
            inputs=dataframe[["alpha", "bravo"]]).value
        expected_labels = [1, 1, 1, 0, 0, 0, 1, 1, 1]
        expected_confidence = [
            0.807,
            0.807,
            0.807,
            0.218,
            0.218,
            0.218,
            0.923,
            0.923,
            0.923,
        ]
        expected_rank = [5, 5, 5, 2, 2, 2, 8, 8, 8]
        self.assertListEqual(list(results["charlie"]), expected_labels)
        np.testing.assert_almost_equal(list(results["confidence"]),
                                       expected_confidence,
                                       decimal=3)
        self.assertListEqual(list(results["rank"]), expected_rank)
Example #21
    def _load_data(self):
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        dataframe.metadata = dataframe.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 2),
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        )

        hyperparam_class = ColumnParserPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        cpp = ColumnParserPrimitive(hyperparams=hyperparam_class.defaults(
        ).replace({
            "parsing_semantics": (
                "http://schema.org/Boolean",
                "http://schema.org/Integer",
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )
        }))
        return cpp.produce(inputs=dataframe).value
Example #22
    def test_constant(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the imputer
        hyperparams_class = CategoricalImputerPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        imputer = CategoricalImputerPrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {
                    "strategy": "constant",
                    "fill_value": "empty",
                    "use_columns": [1, 2, 3, 4],
                }))

        result = imputer.produce(inputs=dataframe).value
        self.assertEqual(result["alpha"].iloc[2], "empty")
        self.assertEqual(result["bravo"].iloc[2], "empty")
        self.assertEqual(result["charlie"].iloc[2], "whiskey")
        self.assertEqual(result["delta"].iloc[2], "empty")
Example #23
    def test_defaults_produce(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # mark grouping key
        dataframe.metadata = dataframe.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )

        # create the clustering primitive and cluster
        hyperparams_class = Sloth.metadata.query()["primitive_code"][
            "class_type_arguments"
        ]["Hyperparams"]
        sloth = Sloth(hyperparams=hyperparams_class.defaults())
        result = sloth.produce(inputs=dataframe).value

        # check if the first four columns match the original
        pd.testing.assert_frame_equal(result.iloc[:, :-1], dataframe)

        # check that the first two keys are each in their own cluster, and the last two are in
        # the same cluster
        clusters_by_key = result[["key", "__cluster"]].drop_duplicates()
        self.assertNotEqual(clusters_by_key.iloc[0, 1], clusters_by_key.iloc[1, 1])
        self.assertNotEqual(clusters_by_key.iloc[1, 1], clusters_by_key.iloc[2, 1])
        self.assertEqual(clusters_by_key.iloc[2, 1], clusters_by_key.iloc[3, 1])

        # check metadata is correct for new column
        column_metadata = result.metadata.query_column(4)
        self.assertListEqual(
            list(column_metadata["semantic_types"]),
            [
                "https://metadata.datadrivendiscovery.org/types/Attribute",
                "https://metadata.datadrivendiscovery.org/types/ConstructedAttribute",
                "http://schema.org/Integer",
            ],
        )
        self.assertEqual(column_metadata["structural_type"], np.int64)
Example #24
    def test_buildFromCode(self):
        deck = get_dataframe()

        testDeck = deck['cardCode'].value_counts().to_dict()
        # build the "count:cardCode" strings expected by the deck encoder
        cardList = [f"{value}:{key}" for key, value in testDeck.items()]

        testDeck = LoRDeck(cardList)
        code = testDeck.encode()
        testDataframe = buildFromCode(code)

        valid = deck['cardCode'].unique().tolist()
        test = testDataframe['cardCode'].unique().tolist()

        valid.sort()
        test.sort()

        for x, y in zip(valid, test):
            self.assertEqual(x, y)
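
The encode/decode round trip under test can also be exercised directly against the lor_deckcodes library, assuming its documented API; the card codes below are arbitrary but well-formed:

from lor_deckcodes import LoRDeck

# "count:cardCode" strings, as assembled in the test above
deck = LoRDeck(["3:01SI015", "3:01SI044", "2:01FR024"])
code = deck.encode()  # base32 deck-code string
decoded = LoRDeck.from_deckcode(code)  # decodes back to the same cards
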
Example #25
    def _load_data(
        self,
        dataframe_name,
        date_time_index=None,
        value_indices=[],
        parsing_hyperparams=None,
    ):
        dataset = test_utils.load_dataset(self._dataset_path)
        timeseries_df = test_utils.get_dataframe(dataset, dataframe_name)
        self._load_semantics_into_data(
            timeseries_df,
            group_index=1,
            date_time_index=date_time_index,
            value_indices=value_indices,
        )
        hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
        if parsing_hyperparams:
            cpp = ColumnParserPrimitive(hyperparams=hyperparams_class.defaults(
            ).replace(parsing_hyperparams))
        else:
            cpp = ColumnParserPrimitive(
                hyperparams=hyperparams_class.defaults())
        return cpp.produce(inputs=timeseries_df).value
Example #26
def test_matching():
    input_file = "Constants/ColumnsAndValuesData.txt"
    column_name_values = get_dataframe(input_file)

    data_set = generate_randomDF(column_name_values, 100)
    column_names = list(data_set.columns)
    x_column_names = column_names.copy()  # copy so the mutations below don't alter column_names
    x_column_names.remove("foundid")
    x_column_names.insert(0, "lostid")

    x_values = data_set.iloc[0].tolist()
    x_dict = dict(zip(x_column_names, x_values))

    x = pd.DataFrame(x_dict, index=[0])

    x_dataset = Dataset('lost', 'single', x)
    y_dataset = Dataset('found', 'multiple', data_set)
    print(data_set.head())
    print(x.head())
    print(type(x))

    Matching.do_matching(x_dataset, y_dataset, 5)
Example #27
    def test_get_set_params(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the encoder
        hyperparams_class = BinaryEncoderPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        encoder = BinaryEncoderPrimitive(hyperparams=hyperparams_class.
                                         defaults().replace({"min_binary": 3}))
        encoder.set_training_data(inputs=dataframe)
        encoder.fit()

        hyperparams = encoder.hyperparams
        params = encoder.get_params()
        encoder = BinaryEncoderPrimitive(hyperparams=hyperparams)
        encoder.set_params(params=params)

        result = encoder.produce(inputs=dataframe).value

        self.assertEqual(len(result.index), 5)
        self.assertSequenceEqual(
            list(result.columns),
            [
                "d3mIndex",
                "charlie",
                "delta",
                "__binary_0",
                "__binary_1",
                "__binary_2",
                "__binary_3",
                "__binary_4",
            ],
        )
        self.assertSequenceEqual(
            result.dtypes.tolist(),
            [object, object, object, int, int, int, int, int])
Example #28
    def test_datetime(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        df = test_utils.get_dataframe(dataset, "0")
        df.metadata = df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 4), "http://schema.org/DateTime"
        )
        hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
        cpp = ColumnParserPrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {
                    "parsing_semantics": [
                        "http://schema.org/DateTime",
                    ]
                }
            )
        )
        result_df = cpp.produce(inputs=df).value
        self.assertListEqual(
            list(result_df["sierra"]),
            [
                common_utils.parse_datetime_to_float(date, fuzzy=True)
                for date in df["sierra"]
            ],
        )
Example #29
    def test_basic(self) -> None:
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")
        dataframe.drop(columns=["delta", "echo"], inplace=True)

        hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        hyperparams = hyperparams_class.defaults()

        ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)
        ranked_lsvc.set_training_data(
            inputs=dataframe[["alpha", "bravo"]],
            outputs=pd.DataFrame({"charlie":
                                  dataframe["charlie"].astype(int)}),
        )
        ranked_lsvc.fit()
        results = ranked_lsvc.produce(
            inputs=dataframe[["alpha", "bravo"]]).value
        expected_labels = [1, 1, 1, 0, 0, 0, 0, 0, 0]
        expected_confidence = [
            0.73,
            0.73,
            0.73,
            0.269,
            0.269,
            0.269,
            0.052,
            0.052,
            0.052,
        ]
        expected_rank = [8, 8, 8, 5, 5, 5, 2, 2, 2]
        self.assertListEqual(list(results["charlie"]), expected_labels)
        np.testing.assert_almost_equal(list(results["confidence"]),
                                       expected_confidence,
                                       decimal=3)
        self.assertListEqual(list(results["rank"]), expected_rank)
Example #30
    def test_defaults(self) -> None:
        # load test data into a dataframe
        dataset = test_utils.load_dataset(self._dataset_path)
        dataframe = test_utils.get_dataframe(dataset, "learningData")

        # create the encoder
        hyperparams_class = OneHotEncoderPrimitive.metadata.query(
        )["primitive_code"]["class_type_arguments"]["Hyperparams"]
        encoder = OneHotEncoderPrimitive(
            hyperparams=hyperparams_class.defaults())

        encoder.set_training_data(inputs=dataframe)
        encoder.fit()
        result = encoder.produce(inputs=dataframe).value
        self.assertEqual(len(result.index), 5)
        self.assertEqual(
            result.metadata.list_columns_with_semantic_types((
                "https://metadata.datadrivendiscovery.org/types/Attribute", )),
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        )
        self.assertSequenceEqual(list(result.columns), ["d3mIndex"] +
                                 [f"__onehot_{i}" for i in range(10)])
        self.assertSequenceEqual(result.dtypes.tolist(),
                                 [object] + [float for i in range(10)])
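
For reference, the column explosion asserted above (categorical attributes expanding into __onehot_* columns while d3mIndex survives untouched) is analogous to what plain pandas produces, though the primitive's own column names and ordering come from its fitted encoder rather than from pandas:

import pandas as pd

df = pd.DataFrame({"d3mIndex": [0, 1, 2], "color": ["r", "g", "b"]})
onehot = pd.get_dummies(df, columns=["color"], prefix="__onehot")
print(list(onehot.columns))  # ['d3mIndex', '__onehot_b', '__onehot_g', '__onehot_r']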