def cleanData(infoData):
    spark = infoData.get(pc.SPARK)
    sentimentCol = infoData.get(pc.SENTIMENTCOLNAME)
    dataset = infoData.get(pc.DATASET)
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentCol)
    return dataset
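A minimal usage sketch for this helper, assuming `pc` exposes the dictionary keys used above and that a SparkSession already exists; the column name "review" and the parquet path are illustrative placeholders, not the project's values:

# Hypothetical caller for cleanData; the concrete values below are assumptions.
infoData = {
    pc.SPARK: spark,                    # an existing SparkSession
    pc.SENTIMENTCOLNAME: "review",      # illustrative column name
    pc.DATASET: spark.read.parquet("/tmp/reviews.parquet"),  # illustrative path
}
cleaned = cleanData(infoData)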
Example #2
def cleanData(self, infoData):
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)
    sentimentCol = infoData.get(pc.SENTIMENTCOLNAME)
    spark = infoData.get(pc.SPARK)
    dataset = spark.read.parquet(datasetPath)
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentCol)
    return dataset

def sentimentAnalysis(self, sentimentInfoData):
    spark = sentimentInfoData.get(pc.SPARK)
    sentimentDataset = self.textPreProcessing(sentimentInfoData)  # do the one-hot encoding after this
    textProcessing = TextProcessing(sparkSession=spark)
    sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
    sentimentInfoData.update({pc.COLMTOENCODE: pc.DMXLEMMATIZED,
                              pc.DATASET: sentimentDataset})  # column to be used in one-hot encoding
    sentimentInfoData = self.oneHotEncodeData(sentimentInfoData)  # using the stop-words column for now
    sentimentInfoData = self.labelIndexing(sentimentInfoData)  # yields the indexed label column
    sentimentInfoData = self.trainModel(sentimentInfoData)
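For orientation, `oneHotEncodeData` appears to wrap Spark's bag-of-words vectorization (the fuller example below calls `pu.countVectorizer` at the same step); a minimal stand-alone sketch with `pyspark.ml.feature.CountVectorizer`, with illustrative column names in place of the `pc` constants:

from pyspark.ml.feature import CountVectorizer

# Illustrative sketch: turn the lemmatized token column into count vectors.
# "dmx_lemmatized" stands in for pc.DMXLEMMATIZED; the real names may differ.
cv = CountVectorizer(inputCol="dmx_lemmatized",
                     outputCol="onehotencoded_dmx_lemmatized")
sentimentDataset = cv.fit(sentimentDataset).transform(sentimentDataset)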
Example #4
def stopWordsRemover(self, dataset, colName):
    stopWordsList = StopWords.stopWordsKNIME
    sentimentStopWordRemover = StopWordsRemover(
        inputCol=colName,
        outputCol=self.dmxStopWords,
        stopWords=stopWordsList)
    dataset = sentimentStopWordRemover.transform(dataset)
    # Note: this relies on a module-level `spark` session (see the
    # `global spark` pattern in the dictionary-based example below).
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.stemming(dataset, pc.DMXSTOPWORDS)
    dataset = textProcessing.ngrams(dataset, pc.DMXSTOPWORDS, 2)
    dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)
    return dataset
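The `ngrams` helper presumably corresponds to Spark's `NGram` transformer; a minimal sketch under that assumption, with illustrative column names:

from pyspark.ml.feature import NGram

# Illustrative: build bigrams from the stop-word-filtered token column.
ngram = NGram(n=2, inputCol="dmx_stop_words", outputCol="dmx_ngrams")
dataset = ngram.transform(dataset)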
Example #5
    def sentimentAnalysis(self, sentimentInfoData):

        spark = sentimentInfoData.get(pc.SPARK)
        datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
        dataset = spark.read.parquet(datasetPath)
        dataset = pu.addInternalId(dataset)
        sentimentInfoData.update({pc.DATASET: dataset})

        isNgram = sentimentInfoData.get(pc.ISNGRAM)
        sentimentDataset = self.textPreProcessing(sentimentInfoData)  # do the one-hot encoding after this
        textProcessing = TextProcessing()
        sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
        # sentimentDataset = textProcessing.sparkLemmatizer(sentimentDataset, pc.DMXSTOPWORDS)
        if isNgram:
            ngramPara = sentimentInfoData.get(pc.NGRAMPARA)
            sentimentDataset = textProcessing.ngrams(sentimentDataset, pc.DMXLEMMATIZED, ngramPara)  # with n-grams

        modelName = sentimentInfoData.get(pc.MODELSHEETNAME)
        labelColm = sentimentInfoData.get(pc.LABELCOLM)
        indexedColm = pc.INDEXED_ + labelColm
        encodedColm = pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED
        featuresColm = modelName + pc.DMXFEATURE

        sentimentInfoData.update({pc.COLMTOENCODE: pc.DMXLEMMATIZED,
                                  pc.DATASET: sentimentDataset,
                                  pc.COLMTOINDEX: labelColm,
                                  pc.INDEXEDCOLM: indexedColm,
                                  pc.ENCODEDCOLM: encodedColm,
                                  pc.COLMTOVECTORIZED: encodedColm,
                                  pc.FEATURESCOLM: featuresColm,
                                  pc.ORIGINALCOLMNAME: labelColm
                                  })

        sentimentInfoData = pu.stringIndexer(sentimentInfoData)  # yields the indexed label column
        sentimentInfoData = pu.countVectorizer(sentimentInfoData)  # using the lemmatized column for now
        if isNgram:
            sentimentInfoData.update({
                pc.COLMTOENCODE: pc.DMXNGRAMS,
                pc.ENCODEDCOLM: pc.ONEHOTENCODED_ + pc.DMXNGRAMS
            })
            sentimentInfoData = pu.countVectorizer(sentimentInfoData)
            sentimentInfoData.update({pc.COLMTOVECTORIZED: [pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED, pc.ONEHOTENCODED_ + pc.DMXNGRAMS]})

        sentimentInfoData = pu.featureAssembler(sentimentInfoData)  # creating feature vector

        return sentimentInfoData
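`pu.featureAssembler` presumably wraps `pyspark.ml.feature.VectorAssembler`; a sketch of that final step under that assumption, again with illustrative column names:

from pyspark.ml.feature import VectorAssembler

# Illustrative: combine the encoded column(s) into a single features vector.
# With isNgram set, the n-gram encoded column would be appended to inputCols.
assembler = VectorAssembler(inputCols=["onehotencoded_dmx_lemmatized"],
                            outputCol="model_dmx_feature")
sentimentDataset = assembler.transform(sentimentDataset)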
    def sentimentAnalysis(self, sentimentInfoData):
        '''Requirements:
        1. name of the column containing the sentiment sentences
        2. sentiment dataset parquet path
        3. positive dictionary parquet path
        4. negative dictionary parquet path
        5. name of the column holding the words/sentiment in the positive
           and negative dictionaries.
        '''
        sparkSession = sentimentInfoData.get(pc.SPARK)
        # expose the session as a module-level global so helpers such as
        # textPreProcessing can reference `spark` directly
        global spark
        spark = sparkSession

        datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
        dataset = spark.read.parquet(datasetPath)
        dataset = pu.addInternalId(dataset)
        sentimentInfoData.update({pc.DATASET: dataset})
        sentimentDataset = self.textPreProcessing(sentimentInfoData)
        textProcessing = TextProcessing()
        posNegDataset = textProcessing.mergePosNegDataset(sentimentInfoData)
        dataset = self.addTag(sentimentDataset, pc.DMXSTEMMEDWORDS,
                              posNegDataset)
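`addTag` is not shown here; one plausible shape of it, tagging each stemmed token by a lookup join against the merged positive/negative dictionary (every name below, including the `word` join key, is an assumption):

from pyspark.sql import functions as F

# Illustrative only: explode the stemmed-token array, then tag each token by
# joining against the merged dictionary on a hypothetical `word` column.
tokens = sentimentDataset.select("dmx_index",
                                 F.explode("dmx_stemmed_words").alias("word"))
tagged = tokens.join(posNegDataset, on="word", how="left")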
Example #7
    def prediction(self, infoData):
        isNgram = False if infoData.get(pc.ISNGRAM) is None else infoData.get(pc.ISNGRAM)
        predictionColm = infoData.get(pc.PREDICTIONCOLM)
        algoName = infoData.get(pc.ALGORITHMNAME)
        modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
        spark = infoData.get(pc.SPARK)
        datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)
        originalDataset = spark.read.parquet(datasetPath)
        originalDataset = pu.addInternalId(originalDataset)
        infoData.update({pc.DATASET: originalDataset})

        infoData = self.dataTransformation(infoData)

        dataset = infoData.get(pc.DATASET)
        if isNgram:
            # TODO (sahil): handle the None value for the ngram parameter at data-creation time
            textProcessing = TextProcessing()
            ngramPara = infoData.get(pc.NGRAMPARA)
            dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED,
                                            ngramPara)
        # TODO (sahil): algorithm names are hard-coded for the comparison below; revisit when finalising
        if ("GradientBoostClassifier".__eq__(algoName)):
            predictionModel = GBTClassificationModel.load(modelStorageLocation)
        if ("DecisionTreeClassifier".__eq__(algoName)):
            predictionModel = DecisionTreeClassificationModel.load(
                modelStorageLocation)

        dataset = dataset.drop(predictionColm)
        originalDataset = originalDataset.drop(predictionColm)
        dataset = predictionModel.transform(dataset)
        """calling indexToString method after the prediction"""
        infoData.update({pc.DATASET: dataset})
        infoData = self.invertIndex(infoData)

        dataset = infoData.get(pc.DATASET)
        dataset = dataset.select(pc.DMXINDEX, predictionColm)
        finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
        return finalDataset
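`invertIndex` maps the numeric predictions back to the original string labels; the comment above points at Spark's `IndexToString`, so here is a sketch under that assumption. `indexerModel` is hypothetical, standing in for the StringIndexerModel fitted at training time:

from pyspark.ml.feature import IndexToString

# Illustrative: map indexed predictions back to the original string labels.
converter = IndexToString(inputCol="prediction", outputCol="predicted_label",
                          labels=indexerModel.labels)
dataset = converter.transform(dataset)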
Example #8
    def textPreProcessing(self, sentimentInfoData):
        sentimentColName = sentimentInfoData.get(pc.SENTIMENTCOLNAME)
        dataset = sentimentInfoData.get(pc.DATASET)

        textProcessing = TextProcessing()
        dataset = textProcessing.toStringDatatype(dataset, sentimentColName)
        dataset = textProcessing.replaceSpecialChar(dataset, sentimentColName)
        dataset = textProcessing.createToken(dataset, sentimentColName)
        dataset = textProcessing.stopWordsRemover(dataset, pc.DMXTOKENIZED)
        dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)

        return dataset

    def textPreProcessing(self, sentimentInfoData):
        sentimentColName = sentimentInfoData.get(pc.SENTIMENTCOLNAME)
        dataset = sentimentInfoData.get(pc.DATASET)
        lemmatizedModelPath = sentimentInfoData.get(
            pc.LEMMATIZEDPRETRAINEDMODEL)

        textProcessing = TextProcessing(sparkSession=spark)  # uses the module-level `spark` global
        dataset = textProcessing.toStringDatatype(dataset, sentimentColName)
        dataset = textProcessing.replaceSpecialChar(dataset, sentimentColName)
        dataset = textProcessing.createToken(dataset, sentimentColName)
        dataset = textProcessing.stopWordsRemover(dataset, pc.DMXTOKENIZED)
        dataset = textProcessing.sparkLemmatizer(dataset, pc.DMXSTOPWORDS,
                                                 lemmatizedModelPath)

        return dataset
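For reference, the tokenize and stop-word steps above map onto standard `pyspark.ml.feature` transformers; a minimal stand-alone sketch of that part of the chain, with illustrative column names in place of the `pc` constants:

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Illustrative: tokenize the sentiment text, then drop English stop words.
tokenizer = RegexTokenizer(inputCol="review", outputCol="dmx_tokenized",
                           pattern="\\W+")
dataset = tokenizer.transform(dataset)
remover = StopWordsRemover(inputCol="dmx_tokenized", outputCol="dmx_stop_words")
dataset = remover.transform(dataset)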