def decision_tree(df, columns, input_col, **kwargs):
    """
    Runs a decision tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with decision tree and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = DecisionTreeClassifier(**kwargs)
    df = df.cols.rename([(input_col + "_index", "label")])

    dt_model = model.fit(df)
    df_model = dt_model.transform(df)
    return df_model, dt_model
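# Usage sketch (illustrative; column names are assumptions, not from the
# source): assumes a DataFrame with numeric feature columns "age" and "fare"
# and a categorical target "survived". Extra keyword arguments are passed
# straight through to pyspark.ml.classification.DecisionTreeClassifier.
#
#   df_pred, dt_model = decision_tree(df, ["age", "fare", "survived"],
#                                     input_col="survived", maxDepth=5)
#   df_pred.select("label", "prediction").show(5)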
def _iqr(self, action):
    """
    Select or drop outliers using the interquartile range (IQR).
    :param action: "select" keeps only the outlier rows, "drop" removes them.
    :return: Spark DataFrame.
    """
    df = self.df
    columns = self.columns

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    columns = parse_columns(df, columns)

    for col_name in columns:
        iqr = df.cols.iqr(col_name, more=True)
        lower_bound = iqr["q1"] - (iqr["iqr"] * 1.5)
        upper_bound = iqr["q3"] + (iqr["iqr"] * 1.5)

        if action == "drop":
            df = df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
        elif action == "select":
            df = df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

    return df
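# Minimal sketch of the fence logic above using plain PySpark only (no Optimus
# helpers). approxQuantile and the 1.5 * IQR multiplier mirror the bounds
# computed in _iqr; the function name and relative error are illustrative.
def _iqr_bounds_sketch(df, col_name, relative_error=0.05):
    q1, q3 = df.approxQuantile(col_name, [0.25, 0.75], relative_error)
    iqr_value = q3 - q1
    return q1 - 1.5 * iqr_value, q3 + 1.5 * iqr_value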
def z_score(df, columns, threshold=None):
    """
    Delete outliers using the z-score.
    :param df: Spark DataFrame.
    :param columns: Columns to check for outliers.
    :param threshold: Rows with a z-score above this value are dropped.
    :return: Spark DataFrame without the outlier rows.
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)

    for c in columns:
        # The z-score column is always named "z_col_" plus the column name
        z_col = "z_col_" + c
        df = df.cols.z_score(c) \
            .rows.drop(F.col(z_col) > threshold) \
            .cols.drop(z_col)
    return df
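# Usage sketch (illustrative; the column name is an assumption): drop rows
# from a numeric column "billing_amount" whose z-score exceeds 2. Note that
# the threshold must be an int here.
#
#   df_clean = z_score(df, "billing_amount", threshold=2)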
def _z_score(self, action):
    """
    Select or drop outliers using the z-score.
    :param action: "select" keeps only the outlier rows, "drop" removes them.
    :return: Spark DataFrame.
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    columns = parse_columns(df, columns)

    for col_name in columns:
        # The z-score column name is always derived from the column name
        z_col_name = _z_score_col_name(col_name)

        if action == "drop":
            df = df.cols.z_score(col_name, z_col_name) \
                .rows.drop(F.col(z_col_name) > threshold) \
                .cols.drop(z_col_name)
        elif action == "select":
            df = df.cols.z_score(col_name, z_col_name) \
                .rows.select(F.col(z_col_name) > threshold) \
                .cols.drop(z_col_name)
    return df
def _mad(self, action):
    """
    Select or drop outliers using the median absolute deviation (MAD).
    :param action: "select" keeps only the outlier rows, "drop" removes them.
    :return: Spark DataFrame.
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)

    for c in columns:
        mad_value = df.cols.mad(c, more=True)
        lower_bound = mad_value["median"] - threshold * mad_value["mad"]
        upper_bound = mad_value["median"] + threshold * mad_value["mad"]

        if action == "select":
            df = df.rows.select((F.col(c) > upper_bound) | (F.col(c) < lower_bound))
        elif action == "drop":
            df = df.rows.drop((F.col(c) > upper_bound) | (F.col(c) < lower_bound))

    return df
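# Sketch of the same bounds with plain PySpark (no Optimus helpers): the MAD
# is the median of absolute deviations from the median, both approximated
# here with approxQuantile. Names and the relative error are illustrative.
def _mad_bounds_sketch(df, col_name, threshold, relative_error=0.05):
    median = df.approxQuantile(col_name, [0.5], relative_error)[0]
    deviations = df.select(F.abs(F.col(col_name) - median).alias(col_name))
    mad_value = deviations.approxQuantile(col_name, [0.5], relative_error)[0]
    return median - threshold * mad_value, median + threshold * mad_value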
def gbt(df, columns, input_col, **kwargs):
    """
    Runs a gradient-boosted tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient-boosted tree and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = GBTClassifier(**kwargs)
    df = df.cols.rename([(input_col + "_index", "label")])

    gbt_model = model.fit(df)
    df_model = gbt_model.transform(df)
    return df_model, gbt_model
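# Usage sketch (illustrative; column names are assumptions): same calling
# convention as decision_tree above, with keyword arguments forwarded to
# pyspark.ml.classification.GBTClassifier.
#
#   df_pred, gbt_model = gbt(df, ["age", "fare", "survived"],
#                            input_col="survived", maxIter=20)
#   df_pred.select("label", "prediction").show(5)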
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings in a Spark DataFrame into an array of n-grams.
    :param df: Pyspark dataframe to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Spark DataFrame with n-grams calculated.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
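# Usage sketch (illustrative; the column name is an assumption): build
# unigram + bigram TF-IDF features from a free-text column "review_text".
# The "|" pipeline composition follows the convention used in this module.
#
#   df_feats, tfidf_model = n_gram(df, input_col="review_text", n=2)
#   df_feats.select("features").show(3, truncate=False)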
def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression for input (text) DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param input_col: Column to predict.
    :return: DataFrame with logistic regression and prediction run.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()

    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model
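# Usage sketch (illustrative; the column name is an assumption): tokenize a
# text column "message", vectorize the counts, and fit a logistic regression
# in one call. The fitted pipeline model is returned alongside the scored df.
#
#   df_scored, lr_model = logistic_regression_text(df, input_col="message")
#   df_scored.select("prediction", "probability").show(5)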
def iqr(df, columns):
    """
    Delete outliers using the interquartile range (IQR).
    :param df: Spark DataFrame.
    :param columns: Columns to check for outliers.
    :return: Spark DataFrame without the outlier rows.
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    columns = parse_columns(df, columns)

    for column in columns:
        # Avoid shadowing this function's own name with the stats dict
        iqr_value = df.cols.iqr(column, more=True)
        lower_bound = iqr_value["q1"] - (iqr_value["iqr"] * 1.5)
        upper_bound = iqr_value["q3"] + (iqr_value["iqr"] * 1.5)

        df = df.rows.drop((F.col(column) > upper_bound) | (F.col(column) < lower_bound))
    return df
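# Usage sketch (illustrative; column names are assumptions): drop every row
# that falls outside the Tukey fences (1.5 * IQR) for either column.
#
#   df_clean = iqr(df, ["billing_amount", "num_items"])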
def _modified_z_score(self, action):
    """
    Select or drop outliers using the modified z-score.
    :param action: "select" keeps only the outlier rows, "drop" removes them.
    :return: Spark DataFrame.
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    columns = parse_columns(df, columns)

    for col_name in columns:
        median = df.cols.median(col_name)
        median_absolute_deviation = df.select(
            F.abs(F.col(col_name) - median).alias(col_name)).cols.median(col_name)

        m_z_col_name = _m_z_score_col_name(col_name)
        df = df.withColumn(
            m_z_col_name,
            F.abs(0.6745 * (F.col(col_name) - median) / median_absolute_deviation))

        if action == "select":
            df = df.rows.select(F.col(m_z_col_name) > threshold)
        elif action == "drop":
            df = df.rows.drop(F.col(m_z_col_name) > threshold)
    return df
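# Sketch of the same formula on plain Python floats. The 0.6745 constant is
# the 0.75 quantile of the standard normal distribution; it rescales the MAD
# so it estimates the standard deviation when the data is roughly normal.
# The function name is illustrative, not part of the library.
import statistics

def _modified_z_scores_sketch(values):
    med = statistics.median(values)
    mad = statistics.median(abs(v - med) for v in values)
    return [abs(0.6745 * (v - med) / mad) for v in values]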
def mad(df, columns, threshold=None):
    """
    Delete outliers using the median absolute deviation (MAD).
    :param df: Spark DataFrame.
    :param columns: Columns to check for outliers.
    :param threshold: Rows outside median +/- threshold * MAD are dropped.
    :return: Spark DataFrame without the outlier rows.
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)

    for c in columns:
        mad_value = df.cols.mad(c, more=True)
        lower_bound = mad_value["median"] - threshold * mad_value["mad"]
        upper_bound = mad_value["median"] + threshold * mad_value["mad"]

        df = df.rows.drop((F.col(c) > upper_bound) | (F.col(c) < lower_bound))
    return df
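# Usage sketch (illustrative; the column name is an assumption): drop rows
# more than 3 MADs away from the median of "billing_amount".
#
#   df_clean = mad(df, "billing_amount", threshold=3)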
def run(self, df, func_request=None, func_response=None, return_type="json", calls=60, period=60, max_tries=8):
    """
    Read the url key from a Mongo collection and make a request to a service.
    :param df: Dataframe to be loaded into the enricher collection.
    :param func_request: Helper to create a custom request.
    :param func_response: Helper to process a custom response.
    :param return_type: "json" or "text", format of the service response.
    :param calls: How many calls can be made per rate-limit period.
    :param period: Length of the rate-limit period, in seconds.
    :param max_tries: How many times a failed call is retried.
    :return: The dataframe with the enrichment result appended.
    """
    if is_dataframe(df):
        df = df.create_id(COL_ID)

    # Load the dataframe data in the enricher
    self.load(df)

    collection_name = self.collection_name
    collection = self.get_collection(collection_name)

    # Get data that is not yet enriched
    cursor = collection.find({COL_RESULTS: {"$exists": False}})
    total_docs = cursor.count(True)

    if func_request is None:
        func_request = requests.get

    @on_exception(expo, RateLimitException, max_tries=max_tries)
    @limits(calls=calls, period=period)
    def _func_request(v):
        return func_request(v)

    if total_docs > 0:
        for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'):
            # Send request to the API
            response = _func_request(c)
            mongo_id = c["_id"]

            if response.status_code == 200:
                if return_type == "json":
                    response = json.loads(response.text)
                elif return_type == "text":
                    response = response.text

                # Process the result with an external function
                if is_function(func_response):
                    response = func_response(response)

                # Update the mongo id with the result
                collection.find_and_modify(
                    query={"_id": mongo_id},
                    update={"$set": {COL_RESULTS: response}},
                    upsert=False,
                    full_response=True)
            else:
                # The results key stays blank so the document can be
                # filtered and retried in a future request
                logger.print(response.status_code)

        # Append the data in enrichment to the dataframe
        logger.print("Appending collection info into the dataframe")

        # TODO: An elegant way to handle pickling?
        # Capture plain values so the UDF closure pickles cleanly
        host = self.host
        port = self.port
        db_name = self.db_name

        @pandas_udf('string', PandasUDFType.SCALAR)
        def func(value):
            # Open the Mongo connection inside the UDF so nothing
            # unpicklable is captured from the driver
            from pymongo import MongoClient
            _client = MongoClient(host, port)
            _db = _client[db_name]
            _collection = _db[collection_name]

            def func_serie(serie):
                _cursor = _collection.find_one(
                    {COL_ID: serie},
                    projection={"_id": 0, COL_RESULTS: 1})
                return _cursor[COL_RESULTS]

            return value.apply(func_serie)

        df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run()

        # If the process is finished, flush the Mongo collection
        self.flush()
        return df
    else:
        print("No records available to process")
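# Usage sketch (illustrative; the endpoint, field names, and constructor
# arguments are assumptions, not from the source): enrich each stored record
# by calling a REST endpoint, then keep only the "score" field of the JSON
# payload. func_request receives the raw Mongo document for each row.
#
#   def make_request(doc):
#       return requests.get("https://api.example.com/score",
#                           params={"q": doc["url"]})
#
#   def parse_response(payload):
#       return payload["score"]
#
#   df_enriched = enricher.run(df, func_request=make_request,
#                              func_response=parse_response,
#                              calls=30, period=60)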