示例#1
0
    def test_none_cancor(self):
        """All four requested features must be NaN when empty ``can_cors``
        and ``can_cor_eigvals`` arrays are passed as custom arguments."""
        X, y = load_xy(0)

        feats = ["w_lambda", "p_trace", "lh_trace", "roy_root"]

        mfe = MFE(groups=[GNAME], features=feats)
        mfe.fit(X.values, y.values, precomp_groups=None)

        # One shared dict of empty arrays is handed to every feature.
        empty_cancor = {
            "can_cors": np.array([]),
            "can_cor_eigvals": np.array([]),
        }
        per_feat_args = dict.fromkeys(feats, empty_cancor)

        vals = mfe.extract(**per_feat_args, suppress_warnings=True)[1]

        all_nan = np.full(shape=len(vals), fill_value=np.nan)
        assert np.allclose(vals, all_nan, equal_nan=True)
示例#2
0
    def test_ft_methods_model_based_02(self, dt_id, ft_name, exp_value,
                                       precompute):
        """Check one meta-feature of group ``GNAME`` extracted with custom
        decision-tree hyperparameters against its expected value.

        Parameters
        ----------
        dt_id
            Identifier forwarded to ``load_xy`` to pick the dataset.
        ft_name
            Name of the single meta-feature to extract.
        exp_value
            Expected value(s); a scalar NaN means the first extracted value
            must be NaN.
        precompute
            When truthy, ``GNAME`` is passed as ``precomp_groups`` to ``fit``.
        """
        precomp_group = GNAME if precompute else None

        X, y = load_xy(dt_id)
        mfe = MFE(
            groups=[GNAME],
            features=[ft_name],
            hypparam_model_dt={
                "max_depth": 5,
                "min_samples_split": 10,
                "criterion": "entropy",
            },
            random_state=1234,
        )

        mfe.fit(X.values, y.values, precomp_groups=precomp_group)

        if precomp_group is None:
            # Note: the precomputation of 'model-based' group is always
            # forced due to the need of the 'dt_model' value, so only that
            # entry is kept when no precomputation was requested.
            mfe._precomp_args_ft = {
                "dt_model": mfe._precomp_args_ft.get("dt_model")
            }

        value = mfe.extract()[1]

        # NaN must be detected by value, not by identity: ``x is np.nan`` is
        # False for any NaN that is not the ``np.nan`` singleton itself.
        if isinstance(exp_value, (float, np.floating)) and np.isnan(exp_value):
            assert np.isnan(value[0])

        else:
            assert np.allclose(value, exp_value)
示例#3
0
class MetaFeatures:
    """Compute pymfe meta-features for dataset files and store them.

    ``apply`` scans a directory for ``.json`` / ``.arff`` files and runs
    ``calculate`` on each one; ``calculate`` saves a ``Metadata`` record with
    the non-NaN meta-features of the file.
    """

    def __init__(self):
        self.mfe = MFE()
        self.le = preprocessing.LabelEncoder()
        # Same default as ``apply`` so ``calculate`` is usable on its own.
        self.datasets_dir = "mock_datasets/"
        self.datasets = []

    def calculate(self, dataset_filename):
        """Extract, clean and persist the meta-features of one dataset file.

        Parameters
        ----------
        dataset_filename
            File name handed to ``Dataset.get_or_insert``; the file is read
            from ``self.datasets_dir``.

        Returns
        -------
        tuple
            ``(labels, results)``: meta-feature names and their values, with
            NaN entries removed and complex values reduced to the real part.

        Raises
        ------
        ValueError
            If the file is neither a ``json`` nor an ``arff`` file.
        """
        # Reading dataset
        dataset = Dataset.get_or_insert(dataset_filename)
        path = join(self.datasets_dir, dataset.name)
        if dataset.name.endswith("json"):
            data = pd.read_json(path)
        elif dataset.name.endswith("arff"):
            # loadarff returns (records, metadata); only the records are used.
            data = pd.DataFrame(arff_io.loadarff(path)[0])
        else:
            raise ValueError(
                "Unsupported dataset format: {!r}".format(dataset.name))
        # Getting target column
        target = data["class"].values
        # Separating the data from the labels
        values = data.drop("class", axis=1).values
        ft = self.metafeatures(values, target)
        # Getting metafeature names (labels) and the calculated values (results)
        labels = np.array(ft[0])
        results = np.array(ft[1])
        # Dropping every meta-feature whose value is NaN
        not_nan = ~np.isnan(results)
        labels = labels[not_nan].tolist()
        results = results[not_nan].tolist()
        # Sometimes the result is a complex number, use just the real part
        results = [
            value.real if isinstance(value, complex) else value
            for value in results
        ]
        Metadata(dataset=dataset.name,
                 features=labels,
                 values=results).save()
        return (labels, results)

    def metafeatures(self, values, target):
        """Fit the extractor on ``values``/``target`` and return its output.

        Object-typed targets are label-encoded first. The result is whatever
        ``self.mfe.extract()`` returns; ``calculate`` reads item 0 as names
        and item 1 as values.
        """
        # Dealing with object columns (non numeric). ``np.object`` was removed
        # in NumPy 1.24; the builtin ``object`` compares equal to that dtype.
        if target.dtype == object:
            self.le.fit(target)
            target = self.le.transform(target)
        # Calculating metafeatures
        self.mfe.fit(values, target)
        try:
            ft = self.mfe.extract()
        except AttributeError:
            # NOTE(review): presumably raised for non-float input; the retry
            # refits on a float cast of the same values - confirm.
            self.mfe.fit(values.astype(float), target)
            ft = self.mfe.extract()
        return ft

    def apply(self, datasets_fd="mock_datasets/"):
        """Calculate meta-features for every dataset file in ``datasets_fd``.

        Only regular files ending in ``json`` or ``arff`` are processed.
        """
        self.datasets_dir = datasets_fd
        # Getting list of datasets inside directory
        self.datasets = [
            f for f in listdir(self.datasets_dir)
            if isfile(join(self.datasets_dir, f))
            and f.endswith(("json", "arff"))
        ]
        for dataset in self.datasets:
            self.calculate(dataset)
示例#4
0
def extract_from_object(dataset: Union[np.ndarray, list], mfe_params: dict = None) -> Sequence:
    """Fit an ``MFE`` extractor on ``dataset`` and return the extracted values.

    ``mfe_params`` are forwarded to the ``MFE`` constructor; when it is
    ``None`` or empty the module-level default parameters are used instead.
    Warnings are suppressed for both the fit and the extraction.
    """
    use_defaults = mfe_params is None or len(mfe_params) == 0
    params = __default_mfe_params if use_defaults else mfe_params

    extractor = MFE(**params)
    extractor.fit(dataset, suppress_warnings=True)
    extracted = extractor.extract(suppress_warnings=True)
    # Item 1 holds the values, item 0 the names.
    return extracted[1]
示例#5
0
    def test_one_hot_encoding_02(self):
        """With ``transform_cat="one-hot-full"`` the ``N`` custom argument
        must have one column per distinct value of every attribute."""
        X, y = utils.load_xy(1)

        extractor = MFE()
        extractor.fit(X.values, y.values, transform_cat="one-hot-full")

        # Attributes are the columns of X, i.e. the rows of its transpose.
        distinct_per_attr = [np.unique(col).size for col in X.values.T]
        expected_cols = np.sum(distinct_per_attr)

        encoded = extractor._custom_args_ft["N"]
        assert encoded.shape[1] == expected_cols
示例#6
0
    def test_one_hot_encoding_03(self):
        """With ``transform_cat="one-hot"`` on dataset 2 the ``N`` custom
        argument must keep the column count of the original data."""
        X, y = utils.load_xy(2)

        extractor = MFE()
        extractor.fit(X.values, y.values, transform_cat="one-hot")

        original_cols = X.values.shape[1]
        encoded = extractor._custom_args_ft["N"]

        assert encoded.shape[1] == original_cols
示例#7
0
def main():
    """Extract meta-features with pyMFE and evaluate a LightGBM classifier.

    For every ``*.parquet`` file under ``<data_path>/train`` and
    ``<data_path>/valid`` the unsupervised meta-features are extracted and
    paired with the ``best_clf`` label read from ``augment_data.csv``. A
    LightGBM classifier is fitted on the train split, and micro-averaged
    recall and precision on the validation split are logged to
    Weights & Biases.
    """
    args = parse_args()
    wandb.init(project='DeepMetaLearning', name='classical', config=args)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    mfe = MFE(random_state=args.seed)
    data_path = pathlib.Path(args.data_path)

    def _extract_split(split, scores):
        # Build one row of meta-features (plus label) per parquet file.
        rows = []
        for fname in tqdm(list((data_path / split).glob('*.parquet'))):
            df = pd.read_parquet(fname)
            X = df.drop(columns=["class"]).values
            # Only unsupervised features are evaluated: no target is fitted.
            mfe.fit(X)
            ft = dict(zip(*mfe.extract()))
            # Positional index of the highest score in this file's row.
            ft["best_clf"] = scores.loc[fname.name].argmax()
            rows.append(ft)
        return pd.DataFrame(rows)

    print("Extracting meta-features for train files")
    scores_data = pd.read_csv("augment_data.csv", index_col="filename")
    train_df = _extract_split('train', scores_data)

    print("Extracting meta-features for validation files")
    valid_df = _extract_split('valid', scores_data)

    if args.save_mfe:
        train_df.to_csv("mfe.train.csv", index=False)
        # The validation features (not the train ones) go to the test file.
        valid_df.to_csv("mfe.test.csv", index=False)

    target = "best_clf"
    xtrain = train_df.drop(columns=[target]).values
    xtest = valid_df.drop(columns=[target]).values
    # 1-D targets: estimators and metrics expect a vector, not a column frame.
    ytrain = train_df[target]
    ytrue = valid_df[target]
    lg = LGBMClassifier(random_state=args.seed, objective='multiclass')
    lg.fit(xtrain, ytrain)
    yhat = lg.predict(xtest)

    recall = metrics.recall_score(ytrue, yhat, average="micro")
    precis = metrics.precision_score(ytrue, yhat, average="micro")
    wandb.log({"recall": recall})
    wandb.log({"precision": precis})
示例#8
0
 def _get_feats(cls):
     """Return the default MFE meta-feature names extracted from the iris
     dataset, with every ``.`` replaced by ``_``."""
     from sklearn.datasets import load_iris
     from pymfe.mfe import MFE
     iris = load_iris()
     extractor = MFE()
     extractor.fit(iris.data, iris.target)
     # Item 0 of the extraction result holds the feature names.
     names = extractor.extract()[0]
     return [name.replace(".", "_") for name in names]
示例#9
0
    def test_one_hot_encoding_04(self):
        """Fitting with ``transform_cat="one-hot"`` must raise ``ValueError``
        once a constant string column has been appended to the data."""
        X, y = utils.load_xy(2)
        mfe = MFE()

        # Extra column with the same string value in every row.
        constant_col = np.ones((y.size, 1), dtype=str)
        X = np.hstack((X.values, constant_col))
        y = y.values

        with pytest.raises(ValueError):
            mfe.fit(X=X, y=y, transform_cat="one-hot")
示例#10
0
 def transform(self, X, y):
     """Return the ``general`` meta-features of ``(X, y)`` as a NaN-free array.

     Parameters
     ----------
     X
         Feature data; a ``pd.DataFrame`` is converted to an ``int8`` array.
     y
         Targets; a ``pd.Series`` is converted to an ``int32`` array.

     Returns
     -------
     np.ndarray
         Extracted meta-feature values with NaN replaced by 0.
     """
     if isinstance(X, pd.DataFrame):
         # NOTE(review): the int8 cast assumes small integer-coded features;
         # confirm against the callers.
         X = X.to_numpy(dtype='int8')
     if isinstance(y, pd.Series):
         y = y.to_numpy(dtype='int32')
     mfe = MFE(groups=["general"],
               summary=['kurtosis', 'min', 'max', 'median', 'skewness'])
     mfe.fit(X, y)
     ft = mfe.extract()[1]
     # The second positional parameter of nan_to_num is ``copy``, not the
     # fill value, so the replacement for NaN is passed by keyword.
     return np.nan_to_num(np.array(ft), nan=0.0)
示例#11
0
    def test_gray_encoding_missing_value(self):
        """Fitting with ``transform_cat="gray"`` must raise ``ValueError``
        when the data contains a missing value."""
        X, y = utils.load_xy(1)
        mfe = MFE()

        # Work on a copy so the loaded dataset itself is not modified.
        X_with_nan = np.copy(X.values)
        X_with_nan[5, 0] = np.nan
        targets = y.values

        with pytest.raises(ValueError):
            mfe.fit(X_with_nan, targets, transform_cat="gray")
示例#12
0
    def test_integration_complexity(self, dt_id, exp_value, precompute):
        """Compare every mean-summarized meta-feature of group ``GNAME``
        with the expected values, allowing a 2.5% relative tolerance."""
        X, y = load_xy(dt_id)

        extractor = MFE(groups=[GNAME], summary="mean", random_state=1234)
        extractor.fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None,
        )

        # Item 1 of the extraction result holds the values.
        extracted_vals = extractor.extract()[1]

        assert np.allclose(
            extracted_vals, exp_value, equal_nan=True, rtol=0.025)
示例#13
0
    def test_integration_model_based(self, dt_id, exp_value, precompute):
        """Compare every mean-summarized meta-feature of group ``GNAME``
        with the expected values (NaNs compare equal)."""
        X, y = load_xy(dt_id)

        extractor = MFE(groups=[GNAME], summary="mean", random_state=1234)
        extractor.fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None,
        )

        # Item 1 of the extraction result holds the values.
        extracted_vals = extractor.extract()[1]

        assert np.allclose(extracted_vals, exp_value, equal_nan=True)
示例#14
0
    def test_extract_from_model(self):
        """Meta-features taken from an already fitted decision tree must
        match those of the ``model-based`` group fitted on the same data."""
        X, y = utils.load_xy(2)

        fitted_tree = sklearn.tree.DecisionTreeClassifier(random_state=1234)
        fitted_tree = fitted_tree.fit(X.values, y.values)

        from_model = MFE(random_state=1234).extract_from_model(fitted_tree)
        names_model, vals_model = from_model

        reference = MFE(groups="model-based", random_state=1234)
        reference.fit(X=X.values, y=y.values, transform_num=False)
        names_ref, vals_ref = reference.extract()

        assert np.all(names_model == names_ref) and np.allclose(
            vals_model, vals_ref)
示例#15
0
    def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
        """Check one meta-feature of group ``GNAME`` against its expected
        value.

        Parameters
        ----------
        dt_id
            Identifier forwarded to ``load_xy`` to pick the dataset.
        ft_name
            Name of the single meta-feature to extract.
        exp_value
            Expected value(s); a scalar NaN means the first extracted value
            must be NaN.
        precompute
            When truthy, ``GNAME`` is passed as ``precomp_groups`` to ``fit``.
        """
        precomp_group = GNAME if precompute else None

        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], features=[ft_name], random_state=1234)

        mfe.fit(X.values, y.values, precomp_groups=precomp_group)

        value = mfe.extract()[1]

        # NaN must be detected by value, not by identity: ``x is np.nan`` is
        # False for any NaN that is not the ``np.nan`` singleton itself.
        if isinstance(exp_value, (float, np.floating)) and np.isnan(exp_value):
            assert np.isnan(value[0])
        else:
            assert np.allclose(value, exp_value, equal_nan=True)
示例#16
0
    def test_parse_valid_metafeatures(self, groups):
        """The names returned by ``parse_by_group`` must be exactly the
        valid meta-features of the requested groups."""
        X, y = utils.load_xy(0)

        extractor = MFE(
            groups="all", summary=None, lm_sample_frac=0.5, random_state=1234
        )
        extractor.fit(X.values, y.values)
        extracted = extractor.extract()

        expected_names = extractor.valid_metafeatures(groups=groups)
        parsed_names, _ = extractor.parse_by_group(groups, extracted)

        # Equal sets have an empty symmetric difference.
        assert set(parsed_names) == set(expected_names)
def single_group_meta_features(X):
    """Return the values of a fixed list of meta-features for one group of
    points (source or target).

    Every row of ``X`` is given the same dummy label ``0`` when fitting.
    Only the values (item 1 of the extraction result) are returned.
    """
    wanted = [
        "cohesiveness", "cor", "cov", "eigenvalues", "nr_cor_attr", "min",
        "mean", "median", "max", "iq_range", "kurtosis", "skewness", "t_mean",
        "var", "sd", "range", "nr_norm", "nr_outliers"
    ]
    n_rows = X.shape[0]
    dummy_labels = [0] * n_rows

    extractor = MFE(features=wanted, suppress_warnings=True)
    extractor.fit(X, dummy_labels)
    extracted = extractor.extract()

    return extracted[1]
示例#18
0
    def test_ft_method_relative(self, dt_id, summary, precompute, sample_size,
                                exp_value):
        """Check the ``relative`` group, with and without subsampling,
        against the expected values."""
        X, y = load_xy(dt_id)

        extractor = MFE(
            groups=["relative"],
            summary=summary,
            sample_size=sample_size,
            random_state=1234,
        )
        extractor.fit(
            X.values,
            y.values,
            precomp_groups="relative" if precompute else None,
        )

        _, extracted_vals = extractor.extract()

        assert np.allclose(extracted_vals, exp_value)
def bigroup_meta_features(source_pt_emb, target_pt_emb):
    """Return meta-features describing a source/target pair of point sets.

    The two arrays are stacked into one dataset whose label is 0 for source
    rows and 1 for target rows. The extracted meta-features listed in
    ``interest_features`` are kept, in extraction order, followed by three
    distances between the two sets.

    Parameters
    ----------
    source_pt_emb, target_pt_emb
        Arrays concatenated along axis 0; one row per point.

    Returns
    -------
    list
        Selected meta-feature values followed by the results of
        ``permutation_dist``, ``wasserstein_dist`` and ``hausdorff_dist``.
    """
    # One label per row of each group. The target count must come from the
    # target array, otherwise y and X disagree when the group sizes differ.
    y = [0] * source_pt_emb.shape[0] + [1] * target_pt_emb.shape[0]
    X = np.concatenate([source_pt_emb, target_pt_emb], axis=0)

    # Extract several meta-features (more than for single group)
    mfe = MFE(groups=["Statistical", "complexity", "concept", "clustering"],
              suppress_warnings=True)
    mfe.fit(X, y)
    ft = mfe.extract()

    # A set gives constant-time membership tests in the filter below.
    interest_features = {
        'ch', 'cohesiveness.mean', 'cohesiveness.sd', 'conceptvar.mean',
        'conceptvar.sd', 'cor.mean', 'cor.sd', 'cov.mean', 'cov.sd',
        'eigenvalues.mean', 'eigenvalues.sd', 'f3.mean', 'f4.mean', 'gravity',
        'impconceptvar.mean', 'impconceptvar.sd', 'int', 'iq_range.mean',
        'iq_range.sd', 'kurtosis.mean', 'kurtosis.sd', 'mad.mean', 'mad.sd',
        'max.mean', 'max.sd', 'mean.mean', 'mean.sd', 'median.mean',
        'median.sd', 'min.mean', 'min.sd', 'nr_cor_attr', 'nr_norm',
        'nr_outliers', 'pb', 'range.mean', 'range.sd', 'sd.mean', 'sd.sd',
        'sil', 'skewness.mean', 'skewness.sd', 't4', 't_mean.mean',
        't_mean.sd', 'var.mean', 'var.sd', 'vdb', 'vdu', 'wg_dist.mean',
        'wg_dist.sd'
    }

    feat_list = [
        val for feat, val in zip(ft[0], ft[1]) if feat in interest_features
    ]

    # Three extra "distances" between the two groups are appended.
    feat_list.append(permutation_dist(source_pt_emb, target_pt_emb))
    feat_list.append(wasserstein_dist(source_pt_emb, target_pt_emb))
    feat_list.append(hausdorff_dist(source_pt_emb, target_pt_emb))

    return feat_list
示例#20
0
    def test_t1_arguments(self, orig_dist_mat_min, orig_dist_mat_ptp):
        """``t1`` must give the same result whether or not the
        ``orig_dist_mat_*`` arguments are overridden with ``None``."""
        X, y = load_xy(2)

        extractor = MFE(groups="complexity", features="t1")
        extractor.fit(X.values, y.values, transform_num=False)

        # Each falsy flag replaces the matching argument with None.
        t1_overrides = {}
        if not orig_dist_mat_min:
            t1_overrides["orig_dist_mat_min"] = None
        if not orig_dist_mat_ptp:
            t1_overrides["orig_dist_mat_ptp"] = None

        _, res = extractor.extract(t1=t1_overrides)

        expected = [0.015151516, 0.024628395]
        assert np.allclose(res, expected)
示例#21
0
class MFE(Element):
    """Element that replaces a dataset's ``l`` field with its meta-features,
    computed by a ``PYMFE`` extractor."""

    def cs_impl(self):
        """Not defined yet: no configuration space exists for this element.

        Raises
        ------
        NotImplementedError
            Always. It is a subclass of ``Exception``, so existing
            ``except Exception`` handlers still catch it.
        """
        raise NotImplementedError(
            'Specify parameters like "supervised"/"unsupervised" '
            'in the HP tree?')

    def build_impl(self):
        """Create the underlying extractor with its default settings."""
        self.model = PYMFE()

    def apply_impl(self, data):
        """Same as ``use_impl``: the extractor is refitted on every call."""
        return self.use_impl(data)

    def use_impl(self, data):
        """Fit the extractor on ``data.Xy`` and return ``data`` updated with
        the meta-feature values as ``l``."""
        self.model.fit(*data.Xy)
        _, values = self.model.extract(suppress_warnings=True)
        mtf_values = np.array(values)
        # TODO: suppressing NaNs with 0s!! (infinite values are zeroed too)
        mtf_values[~np.isfinite(mtf_values)] = 0
        return data.updated(self, l=mtf_values)
示例#22
0
    def test_extract_metafeature_names_unsupervised_01(self, groups, summary):
        """Names reported before fitting with ``supervised=False`` must match
        the names produced by an unsupervised fit and extraction."""
        X, _ = utils.load_xy(0)

        extractor = MFE(groups=groups, summary=summary)

        # Ask for the names first, while the extractor is still unfitted.
        names_unfitted = extractor.extract_metafeature_names(supervised=False)

        fitted = extractor.fit(X.values)
        names_extracted = fitted.extract(suppress_warnings=True)[0]

        assert names_unfitted == tuple(names_extracted)
示例#23
0
    def test_ft_methods_model_based(self, dt_id, ft_name, exp_value,
                                    precompute):
        """Check one meta-feature of the ``model-based`` group against its
        expected value.

        Parameters
        ----------
        dt_id
            Identifier forwarded to ``load_xy`` to pick the dataset.
        ft_name
            Name of the single meta-feature to extract.
        exp_value
            Expected value(s); a scalar NaN means the first extracted value
            must be NaN.
        precompute
            When truthy, ``"model-based"`` is passed as ``precomp_groups``.
        """
        precomp_group = "model-based" if precompute else None

        X, y = load_xy(dt_id)
        mfe = MFE(groups=["model-based"],
                  features=[ft_name],
                  random_state=1234)

        mfe.fit(X.values, y.values, precomp_groups=precomp_group)

        value = mfe.extract()[1]

        # NaN must be detected by value, not by identity: ``x is np.nan`` is
        # False for any NaN that is not the ``np.nan`` singleton itself.
        if isinstance(exp_value, (float, np.floating)) and np.isnan(exp_value):
            assert np.isnan(value[0])

        else:
            assert np.allclose(value, exp_value)
示例#24
0
    def test_relative_correctness(self, summary, dt_id):
        """The meta-features carrying a ``.relative`` suffix must be exactly
        the valid ``landmarking`` meta-features."""
        X, y = load_xy(dt_id)

        extractor = MFE(
            groups="all",
            summary=summary,
            sample_size=0.5,
            random_state=1234,
        )
        extractor.fit(X.values, y.values)
        extracted_names, _ = extractor.extract()

        expected = extractor.valid_metafeatures(groups="landmarking")

        # Keep the part before the first dot of every name that contains
        # ".relative".
        relative_names = set()
        for name in extracted_names:
            if ".relative" in name:
                relative_names.add(name.split(".")[0])

        assert relative_names == set(expected)
示例#25
0
    def test_ft_methods_landmarking(self, dt_id, ft_name, exp_value,
                                    precompute, sample_size):
        """Check one meta-feature of the ``landmarking`` group against its
        expected value.

        Parameters
        ----------
        dt_id
            Identifier forwarded to ``load_xy`` to pick the dataset.
        ft_name
            Name of the single meta-feature to extract.
        exp_value
            Expected value(s); a scalar NaN means the first extracted value
            must be NaN.
        precompute
            When truthy, ``"landmarking"`` is passed as ``precomp_groups``.
        sample_size
            Forwarded unchanged to the ``MFE`` constructor.
        """
        precomp_group = "landmarking" if precompute else None

        X, y = load_xy(dt_id)
        mfe = MFE(groups=["landmarking"],
                  features=[ft_name],
                  sample_size=sample_size,
                  random_state=1234)

        mfe.fit(X.values, y.values, precomp_groups=precomp_group)

        value = mfe.extract()[1]

        # NaN must be detected by value, not by identity: ``x is np.nan`` is
        # False for any NaN that is not the ``np.nan`` singleton itself.
        if isinstance(exp_value, (float, np.floating)) and np.isnan(exp_value):
            assert np.isnan(value[0])

        else:
            assert np.allclose(value, exp_value)
示例#26
0
def meta_features(X, y, groups=None, suppress=True):
    ''' Extracts and returns the meta-features from a dataset using the Pymfe
    package.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
        Objects without a ``to_numpy`` method are used as they are.
    y: pd.Series
        Contains the series of the target of a given dataset. Objects without
        a ``to_numpy`` method are used as they are.
    groups: list
        Contains the names of the meta-feature groups as available in the
        Pymfe package (pymfe.readthedocs.io). When None, the MFE default
        groups are used.
    suppress: bool
        Forwarded to MFE as ``suppress_warnings``.

    Returns:
    --------
    The result of ``MFE.extract()``: the meta-feature names followed by the
    meta-feature values respective to the names.
    '''
    # Convert only what can be converted. An error raised by ``to_numpy``
    # itself is no longer swallowed.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()

    mfe_kwargs = {"suppress_warnings": suppress}
    # ``is None`` avoids an element-wise comparison for array-like groups.
    if groups is not None:
        mfe_kwargs["groups"] = groups

    mfe = MFE(**mfe_kwargs)
    mfe.fit(X, y)
    return mfe.extract()
示例#27
0
def get_window_features(X, mfe_features, tsfel_config,
                        summary_funcs, n_classes=None,
                        last_window_acc=None, current_acc=None):
    """Build a one-row DataFrame of features describing the window ``X``.

    The row holds the requested MFE meta-features, followed by the columns
    returned by ``gen_tsfel_features``. ``window_acc_delta`` is added when
    both accuracies are given; ``n_classes`` and ``max_possible_entropy``
    (log base 2 of ``n_classes``) are added when ``n_classes`` is given.
    """
    extractor = MFE(features=mfe_features, summary=summary_funcs)
    extractor.fit(X)
    extracted = extractor.extract()

    tsfel_feats = gen_tsfel_features(tsfel_config,
                                     pd.DataFrame(X),
                                     summary=summary_funcs)

    # One single-element column per meta-feature name.
    mfe_columns = {}
    for name, value in zip(extracted[0], extracted[1]):
        mfe_columns[name] = [value]
    stream_feats = pd.concat([pd.DataFrame(mfe_columns), tsfel_feats], axis=1)

    have_both_accs = last_window_acc is not None and current_acc is not None
    if have_both_accs:
        stream_feats["window_acc_delta"] = current_acc - last_window_acc

    if n_classes is None:
        return stream_feats

    stream_feats["n_classes"] = n_classes
    stream_feats["max_possible_entropy"] = math.log(n_classes, 2)
    return stream_feats
示例#28
0
    def test_extract_metafeature_names_unsupervised_02(self, groups, summary):
        """After an unsupervised fit, ``extract_metafeature_names`` must
        return the extracted names for either value of ``supervised``."""
        X, _ = utils.load_xy(0)

        extractor = MFE(groups=groups, summary=summary)
        fitted = extractor.fit(X.values)
        names_extracted = fitted.extract(suppress_warnings=True)[0]

        # Note: once .fit has been called, .extract_metafeature_names is
        # expected to check whether 'y' was fitted. Hence supervised=True
        # should be ignored here and behave like supervised=False.
        names_supervised = extractor.extract_metafeature_names(supervised=True)
        names_unsupervised = extractor.extract_metafeature_names(
            supervised=False)

        assert tuple(names_extracted) == names_supervised == names_unsupervised
示例#29
0
 def test_no_cat_transformation(self):
     """With ``transform_cat=None`` the ``N`` custom argument must be
     empty for dataset 1."""
     X, y = utils.load_xy(1)

     extractor = MFE()
     extractor.fit(X.values, y.values, transform_cat=None)

     numeric_part = extractor._custom_args_ft["N"]
     assert numeric_part.size == 0
示例#30
0
# The standard way to extract meta-features is to use the MFE class.
# Its parameters are the dataset and the groups of measures to be extracted.
# By default, the method extracts all the measures. For instance:

from sklearn.datasets import load_iris
from pymfe.mfe import MFE

# Load the iris dataset and split it into attributes and target
data = load_iris()
X, y = data.data, data.target

###############################################################################
# Extracting all measures
mfe = MFE()
mfe.fit(X, y)
ft = mfe.extract()
# One line per measure: name padded to 50 columns, value to 30.
print("\n".join(f"{name:50} {value:30}" for name, value in zip(*ft[:2])))

###############################################################################
# Extracting general, statistical and information-theoretic measures
mfe = MFE(groups=["general", "statistical", "info-theory"])
mfe.fit(X, y)
ft = mfe.extract()
print("\n".join(f"{name:50} {value:30}" for name, value in zip(*ft[:2])))

###############################################################################
# Changing summarization function
# -------------------------------
#
# Several measures return more than one value. To aggregate them, post