Exemplo n.º 1
0
 def round(self, new, a, precision):
     """
     Add numeric feature ``new`` holding the values of feature ``a`` rounded to ``precision`` decimal places.

     The feature is written to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     # Inside the method body ``round`` resolves to the builtin, not to this method.
     for dataset in (Base.train, Base.test):
         dataset[new] = round(dataset[a], precision)
     Base.data_n()
Exemplo n.º 2
0
 def product(self, new, a, b):
     """
     Add numeric feature ``new`` equal to feature ``a`` multiplied by feature ``b``.

     The feature is written to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     train, test = Base.train, Base.test
     train[new] = train[a] * train[b]
     test[new] = test[a] * test[b]
     Base.data_n()
Exemplo n.º 3
0
 def word_count(self, new, a):
     """
     Add numeric feature ``new`` holding the number of space-separated tokens in the free-form text feature ``a``.

     Each value is split on a single space character, so consecutive spaces produce empty tokens that are counted too.
     """
     def count_tokens(text):
         # Split on a literal single space (not on arbitrary whitespace).
         return len(text.split(" "))

     for dataset in (Base.train, Base.test):
         dataset[new] = dataset[a].apply(count_tokens)
     Base.data_n()
Exemplo n.º 4
0
 def sum(self, new, a, b):
     """
     Add numeric feature ``new`` equal to feature ``a`` plus feature ``b``.

     The feature is written to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     for dataset in (Base.train, Base.test):
         dataset[new] = dataset[a] + dataset[b]
     Base.data_n()
Exemplo n.º 5
0
 def diff(self, new, a, b):
     """
     Add numeric feature ``new`` equal to feature ``a`` minus feature ``b``.

     The feature is written to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     train, test = Base.train, Base.test
     train[new] = train[a] - train[b]
     test[new] = test[a] - test[b]
     Base.data_n()
Exemplo n.º 6
0
 def mapping(self, a, data):
     """
     Replace each value of categorical feature ``a`` with ``data[value]`` in both train and test datasets.

     ``data`` is indexed directly, so a value that is missing from it raises the lookup error of ``data`` (``KeyError`` for a dict). Intended for features with a limited number of categories; use labels otherwise.
     """
     def translate(value):
         return data[value]

     for dataset in (Base.train, Base.test):
         dataset[a] = dataset[a].apply(translate)
     Base.data_n()
Exemplo n.º 7
0
 def add(self, a, num):
     """
     Update numeric feature ``a`` in place by adding the number ``num`` to every value.

     Applied to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     for dataset in (Base.train, Base.test):
         dataset[a] = dataset[a] + num
     Base.data_n()
Exemplo n.º 8
0
 def shape(self):
     """
     Return a string with the shape (samples, features) of the train and test datasets.

     ``Base.data_n()`` is called first; the result has the form ``'train (rows, cols) | test (rows, cols)'``.
     """
     Base.data_n()
     return 'train {} | test {}'.format(Base.train.shape, Base.test.shape)
Exemplo n.º 9
0
    def __init__(self, train, test, target, uid=None):
        """
        Open datasets ``train`` and ``test`` as CSV or JSON files and store in pandas DataFrames ``Base.train`` and ``Base.test``. Set ``Base.target`` and ``Base.uid`` values based on parameters. Initialize ``Plot``, ``Feature``, ``Xgb`` and ``Model`` components.

        ``train`` and ``test`` are file names relative to ``Base.inpath``. The reader is chosen from the extension of ``train`` and used for both files. When the extension is unsupported, or either dataset loads empty, an error is printed and the components are not initialized.

        ``uid``, when given, names an identifier column that is popped from the test dataset into ``Base.uid`` and dropped from the train dataset.
        """
        Base.version = 'v0.9.1'
        Base.outpath = 'output/'
        Base.inpath = '../input/'

        Base.target = target

        error = 'ERROR: SpeedML can only process .csv and .json file extensions.'

        # Pick the reader up front so that an unsupported extension is reported
        # instead of falling through to an unset (or stale) Base.train.
        if train.endswith('.csv'):
            reader = pd.read_csv
        elif train.endswith('.json'):
            reader = pd.read_json
        else:
            print(error)
            return

        Base.train = reader(Base.inpath + train)
        Base.test = reader(Base.inpath + test)

        if Base.train.empty or Base.test.empty:
            print(error)
            return

        if uid:
            Base.uid = Base.test.pop(uid)
            Base.train = Base.train.drop([uid], axis=1)

        self.plot = Plot()
        self.feature = Feature()
        self.xgb = Xgb()
        self.model = Model()

        Base.data_n()
Exemplo n.º 10
0
 def list_len(self, new, a):
     """
     Add numeric feature ``new`` holding ``len(value)`` for each value of feature ``a``, whose values are list objects.

     The feature is written to both ``Base.train`` and ``Base.test``, after which ``Base.data_n()`` is called.
     """
     for dataset in (Base.train, Base.test):
         dataset[new] = dataset[a].apply(len)
     Base.data_n()
Exemplo n.º 11
0
    def __init__(self, train, test, target, uid=None):
        """
        Open datasets ``train`` and ``test`` as CSV or JSON files and store in pandas DataFrames ``Base.train`` and ``Base.test``. Set ``Base.target`` and ``Base.uid`` values based on parameters. Initialize ``Plot``, ``Feature``, ``Xgb`` and ``Model`` components.

        ``train`` and ``test`` are passed to the pandas reader as given. The reader is chosen from the extension of ``train`` and used for both files. When the extension is unsupported, or either dataset loads empty, an error is printed and the components are not initialized.

        ``uid``, when given, names an identifier column that is popped from the test dataset into ``Base.uid`` and dropped from the train dataset.
        """
        self._setup_environment()

        Base.target = target

        error = 'ERROR: SpeedML can only process .csv and .json file extensions.'

        # TODO: Add more file formats supported by the pandas.read_* functions.
        # Pick the reader up front so that an unsupported extension is reported
        # instead of falling through to an unset (or stale) Base.train.
        if train.endswith('.csv'):
            reader = pd.read_csv
        elif train.endswith('.json'):
            reader = pd.read_json
        else:
            print(error)
            return

        Base.train = reader(train)
        Base.test = reader(test)

        if Base.train.empty or Base.test.empty:
            print(error)
            return

        if uid:
            Base.uid = Base.test.pop(uid)
            Base.train = Base.train.drop([uid], axis=1)

        self.plot = Plot()
        self.feature = Feature()
        self.xgb = Xgb()
        self.model = Model()

        Base.data_n()
Exemplo n.º 12
0
 def distribute(self):
     """
     Plot a grid of distribution histograms, one per column of ``Base.train_n``.

     Use it to judge how far each feature's distribution is skewed from normal and to spot likely outliers. The figure is square and its side grows with the number of columns.
     """
     Base.data_n()
     side = len(Base.train_n.columns) * 1.1
     plt.figure()
     Base.train_n.hist(figsize=(side, side))
Exemplo n.º 13
0
 def density(self, a):
     """
     Create new feature named ``a`` + '_density' holding, for every row, how often that row's value of ``a`` occurs in the train dataset.

     Counts come from ``value_counts`` of the train dataset only. Values that never occur in train (possible in the test dataset) receive the smallest count observed in train.
     """
     counts = Base.train[a].value_counts()
     lookup = counts.to_dict()
     # Computed once instead of once per row.
     fallback = counts.min()

     def density_of(value):
         return lookup.get(value, fallback)

     column = a + '_density'
     for dataset in (Base.train, Base.test):
         dataset[column] = dataset[a].apply(density_of)
     Base.data_n()
Exemplo n.º 14
0
 def ordinal(self, y):
     """
     Draw a violin plot of ordinal (categorical numeric) feature ``y`` against the target feature, using ``Base.train_n``.

     Use it to spot outliers of ``y`` within each target value.
     """
     Base.data_n()
     target = Base.target
     label_size = 12
     plt.figure(figsize=(8, 4))
     sns.violinplot(x=target, y=y, data=Base.train_n)
     plt.xlabel(target, fontsize=label_size)
     plt.ylabel(y, fontsize=label_size)
     plt.show()
Exemplo n.º 15
0
 def divide(self, new, a, b):
     """
     Create ``new`` numeric feature by dividing ``a`` / ``b`` feature values. Replace division-by-zero with zero values.

     Both infinite results (``x / 0``) and the undefined ``0 / 0`` case are set to zero. Missing values in ``a`` or ``b`` with a non-zero divisor still propagate as NaN.
     """
     for dataset in (Base.train, Base.test):
         quotient = dataset[a] / dataset[b]
         # Histograms require finite values
         quotient = quotient.replace([np.inf, -np.inf], 0)
         # 0 / 0 yields NaN rather than inf, so zero divisors are handled explicitly.
         dataset[new] = quotient.mask(dataset[b] == 0, 0)
     Base.data_n()
Exemplo n.º 16
0
 def importance(self):
     """
     Plot feature importances computed by an ``ExtraTreesClassifier`` fitted on ``Base.train_n``.

     The target column is used as the label and every other column of ``Base.train_n`` as a predictor.
     """
     Base.data_n()
     numeric = Base.train_n
     target_values = numeric[Base.target].copy()
     predictors = numeric.drop([Base.target], axis=1)
     classifier = ExtraTreesClassifier()
     classifier.fit(predictors, target_values)
     self._plot_importance(predictors.columns, classifier.feature_importances_)
Exemplo n.º 17
0
 def continuous(self, y):
     """
     Draw a scatter plot of the sorted values of continuous (numeric) feature ``y`` from ``Base.train_n``.

     The x axis is the sample position after sorting, so outliers show up at the ends of the curve.
     """
     Base.data_n()
     label_size = 12
     plt.figure(figsize=(8, 6))
     positions = range(Base.train_n.shape[0])
     ordered_values = np.sort(Base.train_n[y].values)
     plt.scatter(positions, ordered_values)
     plt.xlabel('Samples', fontsize=label_size)
     plt.ylabel(y, fontsize=label_size)
     plt.show()
Exemplo n.º 18
0
 def xgb_importance(self):
     """
     Plot feature importances taken from the f-scores of the fitted ``Base.xgb_model``.

     A feature map is written for the non-target columns of ``Base.train_n`` and read back from ``Base._config['outpath'] + 'xgb.fmap'``.
     """
     Base.data_n()
     predictors = Base.train_n.drop([Base.target], axis=1)
     self._create_feature_map(predictors.columns)
     booster = Base.xgb_model.booster()
     fmap_path = Base._config['outpath'] + 'xgb.fmap'
     fscore = booster.get_fscore(fmap=fmap_path)
     names = list(fscore.keys())
     scores = list(fscore.values())
     self._plot_importance(names, scores)
Exemplo n.º 19
0
    def drop(self, features):
        """
        Remove the columns named in ``features`` (a list of strings) from both the train and test datasets.

        Returns a message with the number of features dropped and the number remaining in the train dataset.
        """
        before = Base.train.shape[1]

        Base.train = Base.train.drop(features, axis=1)
        Base.test = Base.test.drop(features, axis=1)
        Base.data_n()

        after = Base.train.shape[1]
        return 'Dropped {} features with {} features available.'.format(
            before - after, after)
Exemplo n.º 20
0
 def correlate(self):
     """
     Plot a correlation-matrix heatmap for the columns of ``Base.train_n``.

     Use it to see whether features are duplicates, or of low or possibly high importance for the model. With fewer than 9 features the cells are drawn larger and annotated with their values.
     """
     Base.data_n()
     corr = Base.train_n.corr()
     features = Base.train_n.shape[1]
     few_features = features < 9
     if few_features:
         cell_size = features * 1.2
     else:
         cell_size = features * 0.5
     plt.figure(figsize=(cell_size, cell_size))
     sns.heatmap(corr, vmax=1, linewidths=.5, square=True, annot=few_features)
     plt.title('feature correlations in train_n dataset')
Exemplo n.º 21
0
    def labels(self, features):
        """
        Generate numerical labels replacing text values from list of categorical ``features``.

        Train and test datasets are stacked so that each feature is encoded with one ``LabelEncoder`` fit over the values of both, then split back by row position. ``Base.train`` and ``Base.test`` are replaced with the encoded frames.
        """
        # Local import: DataFrame.append was removed in pandas 2.0, and
        # pd.concat is the supported way to stack frames.
        import pandas as pd

        n_train = Base.train.shape[0]

        # Give the test rows a placeholder target so both frames share the same
        # columns; done on a copy so Base.test is untouched if encoding fails.
        test = Base.test.copy()
        test[Base.target] = -1
        combine = pd.concat([Base.train, test])

        le = LabelEncoder()
        for feature in features:
            combine[feature] = le.fit_transform(combine[feature])

        # Split by position; copies avoid handing out views of ``combine``.
        Base.train = combine.iloc[:n_train].copy()
        Base.test = combine.iloc[n_train:].drop([Base.target], axis=1)
        Base.data_n()
Exemplo n.º 22
0
    def impute(self):
        """
        Replace empty values in the entire dataframe with median value for numerical features and most common values for text features.

        Train and test datasets are stacked, passed through ``DataFrameImputer().fit_transform`` together, and split back by row position. Returns a message with the number of empty values in the train dataset before and after imputing.
        """
        # Local import: DataFrame.append was removed in pandas 2.0, and
        # pd.concat is the supported way to stack frames.
        import pandas as pd

        start = Base.train.isnull().sum().sum()
        n_train = Base.train.shape[0]

        # Give the test rows a placeholder target so both frames share the same
        # columns; done on a copy so Base.test is untouched if imputing fails.
        test = Base.test.copy()
        test[Base.target] = -1
        combine = pd.concat([Base.train, test])
        combine = DataFrameImputer().fit_transform(combine)

        # Split by position; copies avoid handing out views of ``combine``.
        Base.train = combine.iloc[:n_train].copy()
        Base.test = combine.iloc[n_train:].drop([Base.target], axis=1)
        Base.data_n()

        end = Base.train.isnull().sum().sum()
        message = 'Imputed {} empty values to {}.'
        return message.format(start, end)
Exemplo n.º 23
0
    def feature_selection(self):
        """
        Print the accuracy of an XGBoost classifier for every feature-importance threshold.

        A baseline ``XGBClassifier`` is fitted on a 70/30 split of the train dataset (``random_state=7``); its accuracy in percent is stored in ``self.feature_accuracy`` and printed. Each feature importance is then used in ascending order as a ``SelectFromModel`` threshold, and the threshold, number of selected features ``n`` and resulting accuracy are printed.
        """
        Base.data_n()
        predictors = Base.train_n.drop([Base.target], axis=1)
        labels = Base.train[Base.target]

        # Hold out 30% of the samples for evaluation.
        X_train, X_test, y_train, y_test = train_test_split(
            predictors, labels, test_size=0.3, random_state=7)

        # Baseline model trained on every feature.
        baseline = xgb.XGBClassifier()
        baseline.fit(X_train, y_train)

        baseline_predictions = [round(value) for value in baseline.predict(X_test)]
        baseline_accuracy = accuracy_score(y_test, baseline_predictions)
        self.feature_accuracy = round(baseline_accuracy * 100.0, 2)
        print("Accuracy: %f%%" % (self.feature_accuracy))

        # Re-train using each importance value as the selection threshold.
        for thresh in np.sort(baseline.feature_importances_):
            selector = SelectFromModel(baseline, threshold=thresh, prefit=True)
            reduced_train = selector.transform(X_train)

            reduced_model = xgb.XGBClassifier()
            reduced_model.fit(reduced_train, y_train)

            # Evaluate the reduced model on the held-out samples.
            reduced_test = selector.transform(X_test)
            reduced_predictions = [
                round(value) for value in reduced_model.predict(reduced_test)]
            reduced_accuracy = accuracy_score(y_test, reduced_predictions)
            print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" %
                  (thresh, reduced_train.shape[1], reduced_accuracy * 100.0))
Exemplo n.º 24
0
    def outliers(self, a, lower=None, upper=None):
        """
        Fix outliers for ``lower`` or ``upper`` or both percentile of values within ``a`` feature.

        ``upper`` and ``lower`` are percentiles (0-100) of the train values of ``a``. Train values above the ``upper`` percentile are set to that percentile value, and values below the ``lower`` percentile are set to that percentile value. Only the train dataset is modified.

        Returns a message describing how many values were changed for each bound that was given; the message is empty when neither bound is given.
        """
        # Start from an empty message so that any combination of bounds
        # (only lower, only upper, both, neither) returns cleanly.
        message = ''
        total = Base.train.shape[0]

        if upper is not None:
            upper_value = np.percentile(Base.train[a].values, upper)
            above = Base.train[a] > upper_value
            change = int(above.sum())
            Base.train.loc[above, a] = upper_value
            Base.data_n()
            message += 'Fixed {} or {:.2f}% upper outliers. '.format(
                change, change / float(total) * 100)

        if lower is not None:
            # Computed after the upper fix, so it sees the already-capped values.
            lower_value = np.percentile(Base.train[a].values, lower)
            below = Base.train[a] < lower_value
            change = int(below.sum())
            Base.train.loc[below, a] = lower_value
            Base.data_n()
            message += 'Fixed {} or {:.2f}% lower outliers.'.format(
                change, change / float(total) * 100)

        return message
Exemplo n.º 25
0
    def eda(self):
        """
        Performs speed exploratory data analysis (EDA) on the current state of datasets. Returns metrics and recommendations as a dataframe. Progressively hides metrics as they achieve workflow completion goals or meet the configured defaults and thresholds.

        The returned dataframe has one row per metric and the columns ``Results`` and ``Observations``. Thresholds are read from ``Base._config`` (``outlier_threshold``, ``overfit_threshold``, ``unique_ratio``, ``high_cardinality``).
        """
        Base.data_n()

        config = Base._config
        n_samples, n_features = Base.train.shape

        eda_index = ['Speedml Release']
        eda_metrics = [[_RELEASE, 'Visit https://speedml.com for release notes.']]

        # Count every missing cell in both datasets. Summing each frame on its
        # own avoids NaN from columns that exist in only one of them.
        nulls = int(Base.train.isnull().sum().sum()
                    + Base.test.isnull().sum().sum())
        if nulls:
            eda_index.append('Nulls')
            eda_metrics.append([nulls, 'Use feature.impute.'])

        skew = Base.train_n.skew()
        skew_upper = skew[skew > config['outlier_threshold']]
        skew_lower = skew[skew < -config['outlier_threshold']]
        if not skew_upper.empty:
            eda_index.append('Outliers Upper')
            eda_metrics.append(
                [skew_upper.axes[0].tolist(),
                 'Positive skew (> {}). Use feature.outliers(upper).'.format(
                     config['outlier_threshold'])])
        if not skew_lower.empty:
            eda_index.append('Outliers Lower')
            eda_metrics.append(
                [skew_lower.axes[0].tolist(),
                 'Negative skew (< -{}). Use feature.outliers(lower).'.format(
                     config['outlier_threshold'])])

        eda_index.append('Shape')
        # Features per sample; warn only when the ratio exceeds the threshold.
        feature_by_sample = n_features / float(n_samples) if n_samples else 0
        message = ''
        if feature_by_sample > config['overfit_threshold']:
            message = '#Features / #Samples > {}. Over-fitting.'.format(
                config['overfit_threshold'])
        eda_metrics.append([self.shape(), message])

        numerical_ratio = int(Base.train_n.shape[1] / float(n_features) * 100)
        if numerical_ratio < 100:
            eda_index.append('Numerical Ratio')
            eda_metrics.append(['{}%'.format(numerical_ratio),
                                'Aim for 100% numerical.'])

        # Number of distinct values above which a feature counts as unique/continuous.
        unique_cutoff = config['unique_ratio'] / 100.0 * n_samples

        def bucket_by_cardinality(features, target_notes):
            # Split features into (unique, high-cardinality, categorical) lists
            # and pick the target note matching the bucket the target falls in.
            unique, high, categorical = [], [], []
            target_note = None
            for feature in features:
                distinct = Base.train[feature].value_counts().count()
                if distinct > unique_cutoff:
                    bucket, note = unique, target_notes[0]
                elif distinct > config['high_cardinality']:
                    bucket, note = high, target_notes[1]
                elif distinct > 1:
                    bucket, note = categorical, target_notes[2]
                else:
                    # Constant or all-null features are not reported.
                    continue
                bucket.append(feature)
                if feature == Base.target:
                    target_note = note
            return unique, high, categorical, target_note

        # Fallback when the target is in no bucket (absent or a single value).
        target_analysis = ['ERROR.',
                           'Target feature is missing or has a single value.']

        # Plain lists avoid ambiguous array-vs-list comparisons.
        numerical_features = list(Base.train_n.columns.values)
        numerical_set = set(numerical_features)
        # Keep dataset column order so the report is deterministic.
        text_features = [column for column in Base.train.columns.values
                         if column not in numerical_set]

        if numerical_features:
            continuous, high_cardinality_num, categorical_num, note = (
                bucket_by_cardinality(
                    numerical_features,
                    (['Model ready.', 'Use regression models.'],
                     ['Pre-process.', 'Dimensionality reduction?'],
                     ['Model ready.', 'Use classification models.'])))
            if note is not None:
                target_analysis = note

            if high_cardinality_num:
                eda_index.append('Numerical High-cardinality')
                eda_metrics.append([
                    high_cardinality_num,
                    '(>{}) categories. Use feature.density'.format(
                        config['high_cardinality'])])

            if categorical_num:
                eda_index.append('Numerical Categorical')
                eda_metrics.append([
                    categorical_num,
                    ' Use plot.ordinal.'])

            if continuous:
                eda_index.append('Numerical Continuous')
                eda_metrics.append([
                    continuous,
                    '~{}% unique. Use plot.continuous.'.format(
                        config['unique_ratio'])])

        if text_features:
            text, high_cardinality_text, categorical_text, note = (
                bucket_by_cardinality(
                    text_features,
                    (['ERROR.', 'Unique text cannot be a target variable.'],
                     ['Pre-process.', 'Use feature.labels.'],
                     ['Pre-process.', 'Use feature.labels or feature.mapping.'])))
            if note is not None:
                target_analysis = note

            if high_cardinality_text:
                eda_index.append('Text High-cardinality')
                eda_metrics.append([
                    high_cardinality_text,
                    '(>{}) categories. Use feature.labels.'.format(
                        config['high_cardinality'])])

            if categorical_text:
                eda_index.append('Text Categorical')
                eda_metrics.append([
                    categorical_text,
                    'Use feature.labels or feature.mapping.'])

            if text:
                eda_index.append('Text Unique')
                eda_metrics.append([
                    text,
                    '~{}% unique. Use feature.extract or feature.drop.'.format(
                        config['unique_ratio'])])

        eda_index.append('Target Analysis ({})'.format(Base.target))
        eda_metrics.append(target_analysis)

        eda_df = pd.DataFrame(eda_metrics,
                              index=eda_index,
                              columns=['Results', 'Observations'])

        return eda_df