def test_ngram(self):
    """Check NGram's configured params and its 4-gram output for five tokens."""
    tokens = ["a", "b", "c", "d", "e"]
    source_df = self.spark.createDataFrame([Row(input=tokens)])
    four_gram = NGram(n=4, inputCol="input", outputCol="output")

    # The constructor arguments must be readable back through the getters.
    self.assertEqual(four_gram.getN(), 4)
    self.assertEqual(four_gram.getInputCol(), "input")
    self.assertEqual(four_gram.getOutputCol(), "output")

    # Five tokens with n=4 give exactly two space-joined windows.
    first_row = four_gram.transform(source_df).head()
    self.assertEqual(first_row.output, ["a b c d", "b c d e"])
def test_ngram(self):
    """Verify the NGram getters and the transformed output column."""
    rows = [Row(input=["a", "b", "c", "d", "e"])]
    frame = self.spark.createDataFrame(rows)
    transformer = NGram(n=4, inputCol="input", outputCol="output")

    # Each getter is called and asserted in turn, in the order n, input
    # column, output column.
    getter_expectations = (
        (transformer.getN, 4),
        (transformer.getInputCol, "input"),
        (transformer.getOutputCol, "output"),
    )
    for getter, expected in getter_expectations:
        self.assertEqual(getter(), expected)

    result = transformer.transform(frame)
    expected_ngrams = ["a b c d", "b c d e"]
    self.assertEqual(result.head().output, expected_ngrams)
def transformData(df, parameter):
    """Run an NGram transformer over ``df`` and display the result.

    Input :
        - df : dataframe handed to ``NGram.transform``.
        - parameter : mapping providing the keys ``"n"``, ``"inputCol"`` and
          ``"outputCol"``, which are forwarded to ``NGram``.

    Output :
        - ``''`` (after printing a "No element in ..." message) when the
          first row's input column holds fewer than ``n`` tokens, or when
          there is no first row / no token list to inspect.
        - otherwise the value returned by ``show()`` on the transformed
          dataframe.

    NOTE(review): if ``df`` is a pyspark DataFrame, ``show()`` prints the
    rows and returns None, so the transformed dataframe itself is never
    returned -- confirm whether callers expect the frame instead.
    """
    input_col = parameter["inputCol"]
    ngram = NGram(n=parameter["n"], inputCol=input_col,
                  outputCol=parameter["outputCol"])

    # Build the transformed frame once and reuse it for both the length
    # check and the display, instead of calling transform() twice.
    transformed = ngram.transform(df)

    # Only the first row is inspected. Read the column that was actually
    # configured as the input rather than a hard-coded "inputTokens".
    first_row = transformed.head()
    # Presumably head() yields None for an empty frame (pyspark behaviour);
    # treat that, and a null token list, like "too few tokens".
    tokens = None if first_row is None else first_row[input_col]

    if tokens is None or len(tokens) < ngram.getN():
        print('No element in ' + parameter["outputCol"])
        return ''

    return transformed.show()