# NOTE: imports reconstructed for this excerpt. Helper functions such as
# _get_columns, _numeric_input_conditions, drop_replace_columns and
# _determine_default_category are assumed to be defined elsewhere in this
# module.
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import (
    CountVectorizer,
    HashingVectorizer,
    TfidfVectorizer,
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from textblob import TextBlob


def spacy_feature_postag(
    x_train, x_test=None, list_of_cols=[], new_col_name="_postagged"
):
    """
    Part of Speech tag the text data provided. Used to tag each word as a Noun, Adjective, Verb, etc.

    This utilizes the spaCy NLP engine.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_postagged`

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

        Returns 2 Dataframes if x_test is provided.
    """

    list_of_cols = _get_columns(list_of_cols, x_train)
    nlp = spacy.load("en_core_web_sm")

    for col in list_of_cols:
        transformed_text = map(nlp, x_train[col])
        x_train[col + new_col_name] = pd.Series(
            map(
                lambda x: list(map(lambda token: (token, token.pos_), x)),
                transformed_text,
            )
        )

        if x_test is not None:
            transformed_text = map(nlp, x_test[col])
            x_test[col + new_col_name] = pd.Series(
                map(
                    lambda x: list(map(lambda token: (token, token.pos_), x)),
                    transformed_text,
                )
            )

    return x_train, x_test

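
# Usage sketch (illustrative, not part of the library; the _demo_* name is
# hypothetical). Assumes the "en_core_web_sm" spaCy model is installed.
def _demo_spacy_feature_postag():
    df = pd.DataFrame({"text": ["The quick brown fox jumps over the lazy dog."]})
    train, _ = spacy_feature_postag(df, list_of_cols=["text"])
    # Each cell holds a list of (token, POS) tuples, e.g. (fox, "NOUN").
    print(train["text_postagged"].iloc[0])
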
def replace_missing_mean_median_mode(
    x_train, x_test=None, list_of_cols=[], strategy=""
):
    """
    Replaces missing values in every numeric column with the mean, median or mode of that column, as specified by strategy.

    Mean: Average value of the column. Affected by outliers.
    Median: Middle value of a list of numbers. Equal to the mean if x_train follows a normal distribution. Not affected much by anomalies.
    Mode: Most common number in a list of numbers.

    Parameters
    ----------
    x_train : Dataframe or array like - 2d
        Dataset

    x_test : Dataframe or array like - 2d
        Testing dataset, by default None.

    list_of_cols : list, optional
        A list of specific columns to apply this technique to.
        If `list_of_cols` is not provided, the strategy will be applied to all numeric columns, by default []

    strategy : str
        Strategy for replacing missing values.
        Can be either "mean", "median" or "most_frequent"

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with missing values replaced in the specified columns.

        Returns 2 Dataframes if x_test is provided.
    """

    if strategy != "most_frequent":
        list_of_cols = _numeric_input_conditions(list_of_cols, x_train)
    else:
        list_of_cols = _get_columns(list_of_cols, x_train)

    imp = SimpleImputer(strategy=strategy)

    fit_data = imp.fit_transform(x_train[list_of_cols])
    fit_df = pd.DataFrame(fit_data, columns=list_of_cols)
    x_train = drop_replace_columns(x_train, list_of_cols, fit_df)

    if x_test is not None:
        fit_x_test = imp.transform(x_test[list_of_cols])
        fit_test_df = pd.DataFrame(fit_x_test, columns=list_of_cols)
        x_test = drop_replace_columns(x_test, list_of_cols, fit_test_df)

    return x_train, x_test

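
# Usage sketch (illustrative, not part of the library). Note that the imputer
# is fit on x_train only, so test values are filled with training statistics
# and no information leaks from the test set.
def _demo_replace_missing_mean():
    train = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
    test = pd.DataFrame({"a": [np.nan], "b": [6.0]})
    train, test = replace_missing_mean_median_mode(train, test, strategy="mean")
    print(train)  # NaN in "a" -> 2.0, NaN in "b" -> 4.5
    print(test)   # NaN in "a" -> 2.0 (the training mean, not a test statistic)
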
def spacy_feature_noun_phrases(
    x_train, x_test=None, list_of_cols=[], new_col_name="_phrases"
):
    """
    Extracts noun phrases from the given data.

    This utilizes the spaCy NLP engine.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_phrases`

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

        Returns 2 Dataframes if x_test is provided.
    """

    list_of_cols = _get_columns(list_of_cols, x_train)
    nlp = spacy.load("en_core_web_sm")

    for col in list_of_cols:
        transformed_text = list(map(nlp, x_train[col]))
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: [str(phrase) for phrase in x.noun_chunks], transformed_text)
        )

        if x_test is not None:
            transformed_text = map(nlp, x_test[col])
            # Stringify the noun chunks as above; storing the raw generator
            # would leave unusable generator objects in the column.
            x_test[col + new_col_name] = pd.Series(
                map(
                    lambda x: [str(phrase) for phrase in x.noun_chunks],
                    transformed_text,
                )
            )

    return x_train, x_test

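
# Usage sketch (illustrative, not part of the library). Assumes the
# "en_core_web_sm" spaCy model is installed.
def _demo_spacy_feature_noun_phrases():
    df = pd.DataFrame({"text": ["Natural language processing powers many new products."]})
    train, _ = spacy_feature_noun_phrases(df, list_of_cols=["text"])
    # e.g. ["Natural language processing", "many new products"]
    print(train["text_phrases"].iloc[0])
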
def feature_one_hot_encode(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs
):
    """
    Creates a matrix of converted categorical columns into binary columns of ones and zeros.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list
        A list of specific columns to apply this technique to.

    keep_col : bool
        Whether to keep the columns being transformed, by default True (keep).

    algo_kwargs : optional
        Parameters you would pass into the OneHotEncoder constructor as a dictionary, by default {"handle_unknown": "ignore"}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.

        Returns 2 Dataframes if x_test is provided.
    """

    # Merge the default with any caller-supplied kwargs so that passing
    # handle_unknown explicitly does not raise a duplicate-keyword error.
    enc = OneHotEncoder(**{"handle_unknown": "ignore", **algo_kwargs})
    list_of_cols = _get_columns(list_of_cols, x_train)

    enc_data = enc.fit_transform(x_train[list_of_cols]).toarray()
    enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names(list_of_cols))
    x_train = drop_replace_columns(x_train, list_of_cols, enc_df, keep_col)

    if x_test is not None:
        enc_x_test = enc.transform(x_test[list_of_cols]).toarray()
        enc_test_df = pd.DataFrame(
            enc_x_test, columns=enc.get_feature_names(list_of_cols)
        )
        x_test = drop_replace_columns(x_test, list_of_cols, enc_test_df, keep_col)

    return x_train, x_test

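
# Usage sketch (illustrative, not part of the library).
def _demo_feature_one_hot_encode():
    train = pd.DataFrame({"color": ["red", "blue", "red"]})
    test = pd.DataFrame({"color": ["green"]})  # category unseen during fit
    train, test = feature_one_hot_encode(train, test, list_of_cols=["color"])
    # handle_unknown="ignore" encodes the unseen "green" as an all-zero row
    # instead of raising an error.
    print(test)
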
def feature_hash_vectorizer(
    x_train, x_test=None, list_of_cols=[], keep_col=True, **hashing_kwargs
):
    """
    Returns a hashed encoding of text data.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them.

    hashing_kwargs : dict, optional
        Parameters you would pass into the HashingVectorizer constructor, by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.

        Returns 2 Dataframes if x_test is provided.
    """

    enc = HashingVectorizer(**hashing_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data)
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test)
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test

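
# Usage sketch (illustrative, not part of the library).
def _demo_feature_hash_vectorizer():
    train = pd.DataFrame({"text": ["spam spam eggs", "eggs ham"]})
    # n_features keeps the demo small; HashingVectorizer defaults to 2**20.
    train, _ = feature_hash_vectorizer(train, list_of_cols=["text"], n_features=8)
    print(train)  # 8 hashed count columns (collisions are possible at this size)
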
def feature_bag_of_words(
    x_train, x_test=None, list_of_cols=[], keep_col=False, **algo_kwargs
):
    """
    Creates a matrix of how many times a word appears in a document.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them.

    algo_kwargs : dict, optional
        Parameters you would pass into the Bag of Words constructor as a dictionary, by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.

        Returns 2 Dataframes if x_test is provided.
    """

    enc = CountVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, columns=enc.get_feature_names())
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test

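
# Usage sketch (illustrative, not part of the library).
def _demo_feature_bag_of_words():
    train = pd.DataFrame({"text": ["the cat sat", "the cat ran"]})
    train, _ = feature_bag_of_words(train, list_of_cols=["text"])
    print(train)  # one count column per vocabulary word: cat, ran, sat, the
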
def feature_tfidf(x_train, x_test=None, list_of_cols=[], keep_col=True, **algo_kwargs):
    """
    Creates a matrix of the tf-idf score for every word in the corpus as it pertains to each document.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove them.

    algo_kwargs : optional
        Parameters you would pass into the TFIDF constructor, by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new columns.

        Returns 2 Dataframes if x_test is provided.
    """

    enc = TfidfVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, columns=enc.get_feature_names())
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test

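
# Usage sketch (illustrative, not part of the library).
def _demo_feature_tfidf():
    train = pd.DataFrame({"text": ["dogs bark", "cats meow", "dogs run"]})
    train, _ = feature_tfidf(train, list_of_cols=["text"])
    # "dogs" appears in two of the three documents, so its idf (and hence its
    # tf-idf weight) is lower than that of words unique to one document.
    print(train)
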
def nltk_feature_postag(
    x_train, x_test=None, list_of_cols=[], new_col_name="_postagged"
):
    """
    Part of Speech tag the text data provided. Used to tag each word as a Noun, Adjective, Verb, etc.

    This utilizes TextBlob, which wraps the NLTK tagger.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_postagged`

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

        Returns 2 Dataframes if x_test is provided.
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: TextBlob(x).tags, x_train[col])
        )

        if x_test is not None:
            x_test[col + new_col_name] = pd.Series(
                map(lambda x: TextBlob(x).tags, x_test[col])
            )

    return x_train, x_test

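
# Usage sketch (illustrative, not part of the library). Requires the NLTK
# corpora TextBlob relies on, e.g. `python -m textblob.download_corpora`.
def _demo_nltk_feature_postag():
    df = pd.DataFrame({"text": ["The quick brown fox jumps."]})
    train, _ = nltk_feature_postag(df, list_of_cols=["text"])
    # Penn Treebank tags, e.g. [("The", "DT"), ("quick", "JJ"), ...]
    print(train["text_postagged"].iloc[0])
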
def nltk_feature_noun_phrases(
    x_train, x_test=None, list_of_cols=[], new_col_name="_phrases"
):
    """
    Extracts noun phrases from the given text.

    This utilizes TextBlob, which utilizes the NLTK NLP engine.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to, by default []

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `COLUMN_phrases`

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

        Returns 2 Dataframes if x_test is provided.
    """

    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        x_train[col + new_col_name] = pd.Series(
            map(lambda x: TextBlob(x).noun_phrases, x_train[col])
        )

        if x_test is not None:
            x_test[col + new_col_name] = pd.Series(
                map(lambda x: TextBlob(x).noun_phrases, x_test[col])
            )

    return x_train, x_test

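
# Usage sketch (illustrative, not part of the library). Same corpora
# requirement as the POS-tagging demo above.
def _demo_nltk_feature_noun_phrases():
    df = pd.DataFrame({"text": ["The quick brown fox jumps over the lazy dog."]})
    train, _ = nltk_feature_noun_phrases(df, list_of_cols=["text"])
    # A TextBlob WordList of lowercased phrases, e.g. ["quick brown fox", "lazy dog"]
    print(train["text_phrases"].iloc[0])
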
def replace_missing_new_category(
    x_train, x_test=None, col_to_category=None, constant=None
):
    """
    Replaces missing values in a categorical column with its own category.

    The categories can be chosen automatically from a default set.

    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing Dataset, by default None

    col_to_category : list or dict, optional
        A dictionary mapping column name to the category name you want to replace missing values with, by default None

    constant : str, int or float, optional
        Category placeholder value for missing values, by default None

    Returns
    -------
    Dataframe, *Dataframe
        Cleaned columns of the Dataframe(s) provided, filled with the provided constant.

        Returns 2 Dataframes if x_test is provided.

    Examples
    --------
    >>> ReplaceMissingCategory({'a': "Green", 'b': "Canada", 'c': "December"})
    >>> ReplaceMissingCategory("Blue", ['a', 'b', 'c'])
    """

    if isinstance(col_to_category, list):
        col_to_category = _get_columns(col_to_category, x_train)

    str_missing_categories = ["Other", "Unknown", "MissingDataCategory"]
    num_missing_categories = [-1, -999, -9999]

    if isinstance(col_to_category, dict):
        for col in col_to_category.keys():
            x_train[col].fillna(col_to_category[col], inplace=True)

            if x_test is not None:
                x_test[col].fillna(col_to_category[col], inplace=True)

    elif isinstance(col_to_category, list) and constant is not None:
        for col in col_to_category:
            x_train[col].fillna(constant, inplace=True)

            if x_test is not None:
                x_test[col].fillna(constant, inplace=True)

    else:
        for col in col_to_category:
            # Check if the column is numeric
            if np.issubdtype(x_train[col].dtype, np.number):
                new_category_name = _determine_default_category(
                    x_train, col, num_missing_categories
                )
                x_train[col].fillna(new_category_name, inplace=True)
                # Convert the numeric categorical column to integer
                x_train[col] = x_train[col].astype(int)

                if x_test is not None:
                    x_test[col].fillna(new_category_name, inplace=True)
                    # Convert the numeric categorical column to integer
                    x_test[col] = x_test[col].astype(int)
            else:
                new_category_name = _determine_default_category(
                    x_train, col, str_missing_categories
                )
                x_train[col].fillna(new_category_name, inplace=True)

                if x_test is not None:
                    x_test[col].fillna(new_category_name, inplace=True)

    return x_train, x_test

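
# Usage sketch (illustrative, not part of the library), showing the dict form
# with one replacement category per column.
def _demo_replace_missing_new_category():
    train = pd.DataFrame({"city": ["Oslo", None], "code": [1.0, np.nan]})
    train, _ = replace_missing_new_category(
        train, col_to_category={"city": "Unknown", "code": -1}
    )
    print(train)  # NaN in "city" -> "Unknown", NaN in "code" -> -1.0
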
def groupby_analysis(self, groupby: list, *cols, data_filter=None):
    """
    Groups your data and then provides descriptive statistics for the other columns on the grouped data.

    For numeric data, the descriptive statistics are:

        - count
        - min
        - max
        - mean
        - standard deviation
        - variance
        - median
        - most common
        - sum
        - median absolute deviation
        - number of unique values

    For other types of data:

        - count
        - most common
        - number of unique values

    Parameters
    ----------
    groupby : list
        List of columns to group by.

    cols : str(s)
        Columns you want statistics on; if none are provided, statistics are computed for every column.

    data_filter : Dataframe, optional
        Filtered dataframe, by default None

    Returns
    -------
    Dataframe
        Dataframe of grouped columns and statistics for each column.
    """

    analysis = {}
    numeric_analysis = [
        "count",
        "min",
        "max",
        "mean",
        "std",
        "var",
        "median",
        ("most_common", lambda x: pd.Series.mode(x)[0]),
        "sum",
        "mad",
        "nunique",
    ]
    other_analysis = [
        "count",
        ("most_common", lambda x: pd.Series.mode(x)[0]),
        "nunique",
    ]

    list_of_cols = _get_columns(list(cols), self._data_properties.x_train)

    if isinstance(data_filter, pd.DataFrame):
        data = data_filter
    else:
        data = self._data_properties.x_train.copy()

    for col in list_of_cols:
        if col not in groupby:
            # biufc: bool, int, unsigned int, float, complex
            if data[col].dtype.kind in "biufc":
                analysis[col] = numeric_analysis
            else:
                analysis[col] = other_analysis

    analyzed_data = data.groupby(groupby).agg(analysis)

    return analyzed_data

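
# Standalone sketch of the groupby/agg pattern the method above uses (the real
# method is called on the analysis object and pulls its data from
# self._data_properties.x_train; the DataFrame and column names here are made
# up for illustration).
def _demo_groupby_analysis_pattern():
    data = pd.DataFrame(
        {"region": ["east", "east", "west"], "sales": [10, 20, 5]}
    )
    # A trimmed version of numeric_analysis; (name, func) tuples rename the
    # resulting statistic columns, exactly as in the method above.
    stats = [
        "count",
        "mean",
        ("most_common", lambda x: pd.Series.mode(x)[0]),
        "nunique",
    ]
    print(data.groupby("region").agg({"sales": stats}))
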