예제 #1
0
 def test_ngram(self):
     """Verify NGram's param getters and its 4-gram output for one five-token row."""
     tokens = ["a", "b", "c", "d", "e"]
     source_df = self.spark.createDataFrame([Row(input=tokens)])
     transformer = NGram(n=4, inputCol="input", outputCol="output")

     # The getters must echo the constructor keyword arguments.
     self.assertEqual(transformer.getN(), 4)
     self.assertEqual(transformer.getInputCol(), "input")
     self.assertEqual(transformer.getOutputCol(), "output")

     # Five tokens with n=4 give exactly two space-joined windows.
     first_row = transformer.transform(source_df).head()
     self.assertEqual(first_row.output, ["a b c d", "b c d e"])
예제 #2
0
 def test_ngram(self):
     """Check that a 4-gram NGram reports its params and emits the expected n-grams."""
     rows = [Row(input=["a", "b", "c", "d", "e"])]
     input_df = self.spark.createDataFrame(rows)

     four_gram = NGram(n=4, inputCol="input", outputCol="output")
     self.assertEqual(four_gram.getN(), 4)
     self.assertEqual(four_gram.getInputCol(), "input")
     self.assertEqual(four_gram.getOutputCol(), "output")

     # Only the first (and only) row of the transformed frame is inspected.
     result_df = four_gram.transform(input_df)
     expected_ngrams = ["a b c d", "b c d e"]
     self.assertEqual(result_df.head().output, expected_ngrams)
예제 #3
0
def transformData(df, parameter):
    """
    Apply an NGram transformer configured from ``parameter`` to ``df``.

        Input : - df: dataframe handed to ``NGram.transform``
                - parameter: mapping with the keys "n", "inputCol" and
                  "outputCol"
        Output : - the transformed dataframe (it is also printed with
                   ``show()``), or an empty string when the first row has
                   fewer than ``n`` tokens in the input column

    Only the first row is inspected to decide whether any n-gram can be
    produced; later rows are not checked.
    """
    input_col = parameter["inputCol"]
    output_col = parameter["outputCol"]

    ngram = NGram(n=parameter["n"],
                  inputCol=input_col,
                  outputCol=output_col)

    # Transform once and reuse the result for both the check and the output.
    transformed = ngram.transform(df)

    # head() yields None for an empty dataframe, so guard before indexing.
    # The token column is looked up by the configured input column name
    # rather than a fixed attribute, so any inputCol works.
    first_row = transformed.head()
    tokens = None if first_row is None else first_row[input_col]

    if tokens is None or len(tokens) < ngram.getN():
        print('No element in ' + output_col)
        return ''

    # show() only prints and returns None, so print first and then return
    # the dataframe itself.
    transformed.show()
    return transformed