Example #1
def spacy_feature_postag(
    x_train, x_test=None, list_of_cols=[], new_col_name="_postagged"
):
    """
    Part-of-speech tags the text data provided, labelling each word as a noun, adjective,
    verb, etc.

    This utilizes the spacy NLP engine.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_postagged`
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    nlp = spacy.load("en_core_web_sm")

    for col in list_of_cols:
        # Run each document through the pipeline and keep
        # (word, coarse POS tag) pairs for every token.
        transformed_text = map(nlp, x_train[col])
        x_train[col + new_col_name] = pd.Series(
            map(
                lambda x: list(map(lambda token: (token.text, token.pos_), x)),
                transformed_text,
            )
        )

        if x_test is not None:
            transformed_text = map(nlp, x_test[col])
            x_test[col + new_col_name] = pd.Series(
                map(
                    lambda x: list(map(lambda token: (token.text, token.pos_), x)),
                    transformed_text,
                )
            )

    return x_train, x_test
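
A minimal standalone sketch of the same technique, assuming spacy and its en_core_web_sm model are installed:

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

df = pd.DataFrame({"text": ["The quick brown fox jumps."]})

# Tag each token with its coarse part-of-speech label.
df["text_postagged"] = df["text"].apply(
    lambda doc: [(token.text, token.pos_) for token in nlp(doc)]
)
print(df["text_postagged"][0])
# e.g. [('The', 'DET'), ('quick', 'ADJ'), ..., ('jumps', 'VERB'), ('.', 'PUNCT')]
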
Example #2
def replace_missing_mean_median_mode(x_train,
                                     x_test=None,
                                     list_of_cols=[],
                                     strategy=""):
    """
    Replaces missing values in every numeric column with the mean, median, or mode of that column, as specified by strategy.

    Mean: Average value of the column. Affected by outliers.
    Median: Middle value of a list of numbers. Equal to the mean if x_train follows a normal distribution. Not much affected by outliers.
    Mode: Most common number in a list of numbers.
    
    Parameters
    ----------
    x_train: Dataframe or array like - 2d
        Dataset

    x_test: Dataframe or array like - 2d
        Testing dataset, by default None.

    list_of_cols : list, optional
        A list of specific columns to apply this technique to
        If `list_of_cols` is not provided, the strategy will be
        applied to all numeric columns., by default []

    strategy : str
        Strategy for replacing missing values.
        Can be either "mean", "median" or "most_frequent"
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with missing values replaced.

    Returns 2 Dataframes if x_test is provided.
    """

    if strategy != "most_frequent":
        list_of_cols = _numeric_input_conditions(list_of_cols, x_train)
    else:
        list_of_cols = _get_columns(list_of_cols, x_train)

    imp = SimpleImputer(strategy=strategy)

    fit_data = imp.fit_transform(x_train[list_of_cols])
    fit_df = pd.DataFrame(fit_data, columns=list_of_cols)
    x_train = drop_replace_columns(x_train, list_of_cols, fit_df)

    if x_test is not None:
        fit_x_test = imp.transform(x_test[list_of_cols])
        fit_test_df = pd.DataFrame(fit_x_test, columns=list_of_cols)
        x_test = drop_replace_columns(x_test, list_of_cols, fit_test_df)

    return x_train, x_test
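
For reference, a self-contained sketch of the underlying scikit-learn pattern (the helper functions above are assumed to belong to the surrounding library):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

train = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
test = pd.DataFrame({"a": [np.nan, 2.0], "b": [6.0, np.nan]})

# Fit on the training data only, then reuse the fitted statistics on
# the test data so no test information leaks into the imputation.
imp = SimpleImputer(strategy="median")
train[["a", "b"]] = imp.fit_transform(train[["a", "b"]])
test[["a", "b"]] = imp.transform(test[["a", "b"]])
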
Example #3
def spacy_feature_noun_phrases(
    x_train, x_test=None, list_of_cols=[], new_col_name="_phrases"
):
    """
    Extracts noun phrases from the given data.

    This utilizes the spacy NLP engine.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_phrases`
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    nlp = spacy.load("en")

    for col in list_of_cols:
        transformed_text = list(map(nlp, x_train[col]))
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: [str(phrase) for phrase in x.noun_chunks], transformed_text)
        )

        if x_test is not None:
            transformed_text = map(nlp, x_test[col])
            # Convert the Span objects to strings, matching the training branch.
            x_test[col + new_col_name] = pd.Series(
                map(lambda x: [str(phrase) for phrase in x.noun_chunks], transformed_text)
            )

    return x_train, x_test
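
A standalone sketch of noun-chunk extraction with spaCy, assuming en_core_web_sm is installed:

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

df = pd.DataFrame({"text": ["Autonomous cars shift insurance liability toward manufacturers."]})

# doc.noun_chunks yields Span objects; convert them to plain strings.
df["text_phrases"] = df["text"].apply(
    lambda doc: [str(chunk) for chunk in nlp(doc).noun_chunks]
)
print(df["text_phrases"][0])
# e.g. ['Autonomous cars', 'insurance liability', 'manufacturers']
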
Example #4
def feature_one_hot_encode(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs
):
    """
    Converts categorical columns into binary (one-hot) columns of ones and zeros.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list
         A list of specific columns to apply this technique to.

    keep_col : bool
        Whether to keep the original column being transformed, by default True

    algo_kwargs : optional
        Parameters you would pass into the OneHotEncoder constructor as a dictionary, by default {"handle_unknown": "ignore"}
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    enc = OneHotEncoder(handle_unknown="ignore", **algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    enc_data = enc.fit_transform(x_train[list_of_cols]).toarray()
    enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names(list_of_cols))
    x_train = drop_replace_columns(x_train, list_of_cols, enc_df, keep_col)

    if x_test is not None:
        enc_x_test = enc.transform(x_test[list_of_cols]).toarray()
        enc_test_df = pd.DataFrame(
            enc_x_test, columns=enc.get_feature_names(list_of_cols)
        )
        x_test = drop_replace_columns(x_test, list_of_cols, enc_test_df, keep_col)

    return x_train, x_test
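
A minimal sketch of the encoder behaviour, including how unseen test categories are handled (note that scikit-learn >= 1.0 renames get_feature_names to get_feature_names_out):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"colour": ["red", "green", "blue"]})
test = pd.DataFrame({"colour": ["green", "purple"]})  # "purple" is unseen

# handle_unknown="ignore" encodes unseen categories as all zeros
# instead of raising an error at transform time.
enc = OneHotEncoder(handle_unknown="ignore")
train_enc = enc.fit_transform(train[["colour"]]).toarray()
test_enc = enc.transform(test[["colour"]]).toarray()
print(test_enc)  # second row is all zeros
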
Example #5
def feature_hash_vectorizer(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **hashing_kwargs
):
    """
    Returns a hashed encoding of text data.
    
    Parameters
    ----------
    x_train : DataFrame
        Training dataset
        
    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them, by default True

    hashing_kwargs : dict, optional
        Parameters you would pass into the HashingVectorizer constructor, by default {}
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    enc = HashingVectorizer(**hashing_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data)
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test)
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
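
A minimal sketch of the vectorizer itself; hashing is stateless, so fitting stores nothing:

import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

corpus = pd.Series(["the cat sat", "the dog barked"])

# n_features bounds the output width (the default is 2**20 columns,
# which is why toarray() on real data can be expensive).
enc = HashingVectorizer(n_features=16)
matrix = enc.fit_transform(corpus).toarray()
print(matrix.shape)  # (2, 16)
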
Example #6
def feature_bag_of_words(
    x_train, x_test=None, list_of_cols=[], keep_col=False, **algo_kwargs
):
    """
    Creates a matrix of how many times a word appears in a document.
    
    Parameters
    ----------
    x_train : DataFrame
        Training dataset
        
    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them, by default False

    algo_kwargs : dict, optional
        Parameters you would pass into the CountVectorizer (bag-of-words) constructor as a dictionary, by default {}
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    enc = CountVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, columns=enc.get_feature_names())
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
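
A standalone sketch of the bag-of-words encoding (get_feature_names_out on scikit-learn >= 1.0, get_feature_names on older versions as used above):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

corpus = pd.Series(["the cat sat on the mat", "the dog sat"])

enc = CountVectorizer()
counts = enc.fit_transform(corpus).toarray()

# One row per document, one column per vocabulary word.
bow = pd.DataFrame(counts, columns=enc.get_feature_names_out())
print(bow)
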
Example #7
def feature_tfidf(x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs):
    """
    Creates a matrix of the tf-idf score for every word in the corpus as it pertains to each document.
    
    Either the full dataset or the training data plus testing data must be provided, not both.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset
        
    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them, by default True

    algo_kwargs : optional
        Parameters you would pass into the TfidfVectorizer constructor, by default {}
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column

    Returns 2 Dataframes if x_test data is provided. 
    """

    enc = TfidfVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, columns=enc.get_feature_names())
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
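
A minimal sketch of the TF-IDF pattern: fit the vocabulary and idf weights on training text, then reuse them for unseen text:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_corpus = pd.Series(["the cat sat", "the dog sat"])
test_corpus = pd.Series(["the cat barked"])

enc = TfidfVectorizer()
train_scores = enc.fit_transform(train_corpus).toarray()
# Words never seen during fit ("barked") are simply dropped.
test_scores = enc.transform(test_corpus).toarray()
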
Example #8
def nltk_feature_postag(
    x_train, x_test=None, list_of_cols=[], new_col_name="_postagged"
):
    """
    Part-of-speech tags the text data provided, labelling each word as a noun, adjective,
    verb, etc.

    This utilizes TextBlob, which wraps the NLTK tagger for the tagging process.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_postagged`
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: TextBlob(x).tags, x_train[col])
        )

        if x_test is not None:
            x_test[col + new_col_name] = pd.Series(
                map(lambda x: TextBlob(x).tags, x_test[col])
            )

    return x_train, x_test
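
A self-contained sketch, assuming textblob and its NLTK corpora are installed (python -m textblob.download_corpora):

import pandas as pd
from textblob import TextBlob

df = pd.DataFrame({"text": ["The quick brown fox jumps."]})

# TextBlob.tags returns (word, Penn Treebank tag) pairs.
df["text_postagged"] = df["text"].apply(lambda x: TextBlob(x).tags)
print(df["text_postagged"][0])
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]
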
Example #9
def nltk_feature_noun_phrases(
    x_train, x_test=None, list_of_cols=[], new_col_name="_phrases"
):
    """
    Extracts noun phrases from the given text.

    This utilizes TextBlob, which utilizes the NLTK NLP engine.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_phrases`
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: TextBlob(x).noun_phrases, x_train[col])
        )

        if x_test is not None:
            x_test[col + new_col_name] = pd.Series(
                map(lambda x: TextBlob(x).noun_phrases, x_test[col])
            )

    return x_train, x_test
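
A standalone sketch, again assuming the TextBlob corpora have been downloaded:

import pandas as pd
from textblob import TextBlob

df = pd.DataFrame({"text": ["The titular threat of The Blob is indescribable."]})

# noun_phrases returns a WordList of lower-cased noun phrases.
df["text_phrases"] = df["text"].apply(lambda x: TextBlob(x).noun_phrases)
print(df["text_phrases"][0])
# e.g. ['titular threat', 'blob']
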
Example #10
def replace_missing_new_category(
    x_train, x_test=None, col_to_category=None, constant=None
):
    """
    Replaces missing values in a categorical column with their own category. If no category is
    provided, one is chosen automatically from a default set.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset
        
    x_test : DataFrame
        Testing Dataset, by default None
        
    col_to_category : list or dict, optional
        A dictionary mapping column name to the replacement category, or a list of columns to fill, by default None

    constant : str, int or float, optional
        Category placeholder value for missing values, by default None
    
    Returns
    -------
    Dataframe, *Dataframe:
        Dataframe(s) with missing values in the specified columns replaced by the given category.

    Returns 2 Dataframes if x_test is provided.

    Examples
    --------
    >>> replace_missing_new_category(x_train, col_to_category={'a': "Green", 'b': "Canada", 'c': "December"})
    >>> replace_missing_new_category(x_train, col_to_category=['a', 'b', 'c'], constant="Blue")
    """

    if isinstance(col_to_category, list):
        col_to_category = _get_columns(col_to_category, x_train)

    str_missing_categories = ["Other", "Unknown", "MissingDataCategory"]
    num_missing_categories = [-1, -999, -9999]

    if isinstance(col_to_category, dict):

        for col in col_to_category.keys():
            x_train[col].fillna(col_to_category[col], inplace=True)

            if x_test is not None:
                x_test[col].fillna(col_to_category[col], inplace=True)

    elif isinstance(col_to_category, list) and constant is not None:

        for col in col_to_category:
            x_train[col].fillna(constant, inplace=True)

            if x_test is not None:
                x_test[col].fillna(constant, inplace=True)

    else:

        # Guard against the default: if no columns were given, apply the
        # automatic categories to every column.
        if col_to_category is None:
            col_to_category = x_train.columns

        for col in col_to_category:
            # Check if column is a number
            if np.issubdtype(x_train[col].dtype, np.number):
                new_category_name = _determine_default_category(
                    x_train, col, num_missing_categories
                )
                x_train[col].fillna(new_category_name, inplace=True)

                # Convert numeric categorical column to integer
                x_train[col] = x_train[col].astype(int)

                if x_test is not None:
                    x_test[col].fillna(new_category_name, inplace=True)
                    # Convert numeric categorical column to integer
                    x_test[col] = x_test[col].astype(int)
            else:
                new_category_name = _determine_default_category(
                    x_train, col, str_missing_categories
                )
                x_train[col].fillna(new_category_name, inplace=True)

                if x_test is not None:
                    # The default category was already determined from x_train above.
                    x_test[col].fillna(new_category_name, inplace=True)

    return x_train, x_test
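
The core of the technique is plain fillna with a sentinel category; a minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "colour": ["red", np.nan, "blue"],
    "score": [1.0, np.nan, 3.0],
})

# Give missing values their own explicit category per column.
df["colour"] = df["colour"].fillna("Unknown")
df["score"] = df["score"].fillna(-1).astype(int)
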
Example #11
    def groupby_analysis(self, groupby: list, *cols, data_filter=None):
        """
        Groups your data and then provides descriptive statistics for the other columns on the grouped data.

        For numeric data, the descriptive statistics are:

            - count
            - min
            - max
            - mean
            - standard deviation
            - variance
            - median
            - most common
            - sum
            - mean absolute deviation
            - number of unique values

        For other types of data:

            - count
            - most common
            - number of unique values
        
        Parameters
        ----------
        groupby : list
            List of columns to groupby.

        cols : str(s)
            Columns you want statistics on; if none are provided, statistics are computed for every column.

        data_filter : Dataframe, optional
            Filtered dataframe, by default None
        
        Returns
        -------
        Dataframe
            Dataframe of grouped columns and statistics for each column.
        """

        analysis = {}
        numeric_analysis = [
            "count",
            "min",
            "max",
            "mean",
            "std",
            "var",
            "median",
            ("most_common", lambda x: pd.Series.mode(x)[0]),
            "sum",
            "mad",
            "nunique",
        ]
        other_analysis = [
            "count",
            ("most_common", lambda x: pd.Series.mode(x)[0]),
            "nunique",
        ]

        list_of_cols = _get_columns(list(cols), self._data_properties.x_train)

        if isinstance(data_filter, pd.DataFrame):
            data = data_filter
        else:
            data = self._data_properties.x_train.copy()

        for col in list_of_cols:
            if col not in groupby:
                # biufc - bool, int, unsigned, float, complex
                if data[col].dtype.kind in "biufc":
                    analysis[col] = numeric_analysis
                else:
                    analysis[col] = other_analysis

        analyzed_data = data.groupby(groupby).agg(analysis)

        return analyzed_data
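
A self-contained sketch of the same groupby-and-aggregate pattern with plain pandas:

import pandas as pd

def most_common(s):
    # First mode, matching the lambda used above.
    return s.mode()[0]

df = pd.DataFrame({
    "team": ["a", "a", "b", "b"],
    "points": [3, 1, 2, 2],
    "city": ["x", "y", "y", "y"],
})

# Numeric columns get numeric statistics; other columns get counts,
# the most common value, and the number of unique values.
stats = df.groupby("team").agg({
    "points": ["count", "min", "max", "mean", "median", "nunique"],
    "city": ["count", most_common, "nunique"],
})
print(stats)
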