def cleanData(infoData):
    spark = infoData.get(pc.SPARK)
    sentimentCol = infoData.get(pc.SENTIMENTCOLNAME)
    dataset = infoData.get(pc.DATASET)
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentCol)
    return dataset

def cleanData(self, infoData):
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)
    sentimentCol = infoData.get(pc.SENTIMENTCOLNAME)
    spark = infoData.get(pc.SPARK)
    dataset = spark.read.parquet(datasetPath)
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentCol)
    return dataset
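
# A minimal usage sketch for cleanData, assuming the enclosing class is instantiated
# as `sentimentPipeline` (hypothetical name) and that `pc` exposes the key constants
# used above; the parquet path and column name are illustrative, not the project's.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sentimentCleaning").getOrCreate()
infoData = {
    pc.SPARK: spark,
    pc.SENTIMENTDATASETPATH: "/data/sentiment.parquet",  # hypothetical path
    pc.SENTIMENTCOLNAME: "review_text",                  # hypothetical column name
}
cleanedDataset = sentimentPipeline.cleanData(infoData)  # reads the parquet, strips special chars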

def sentimentAnalysis(self, sentimentInfoData):
    spark = sentimentInfoData.get(pc.SPARK)
    sentimentDataset = self.textPreProcessing(sentimentInfoData)
    # Do the one-hot encoding after pre-processing.
    textProcessing = TextProcessing(sparkSession=spark)
    sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
    # pc.DMXLEMMATIZED is the column to be used in one-hot encoding.
    sentimentInfoData.update({pc.COLMTOENCODE: pc.DMXLEMMATIZED, pc.DATASET: sentimentDataset})
    sentimentInfoData = self.oneHotEncodeData(sentimentInfoData)  # using the stop-words column for now
    sentimentInfoData = self.labelIndexing(sentimentInfoData)  # yields the indexed label
    sentimentInfoData = self.trainModel(sentimentInfoData)
    return sentimentInfoData
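
# Hedged sketch of the label-indexing step that labelIndexing is assumed to perform,
# expressed directly with PySpark's StringIndexer; the column names are illustrative
# and the helper name is hypothetical.
from pyspark.ml.feature import StringIndexer

def labelIndexingSketch(dataset, labelColm="sentiment_label"):
    indexer = StringIndexer(inputCol=labelColm, outputCol="indexed_" + labelColm)
    indexerModel = indexer.fit(dataset)  # learns the label -> index mapping
    return indexerModel.transform(dataset), indexerModel.labels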

def stopWordsRemover(self, dataset, colName):
    stopWordsList = StopWords.stopWordsKNIME
    sentimentStopWordRemover = StopWordsRemover(inputCol=colName,
                                                outputCol=self.dmxStopWords,
                                                stopWords=stopWordsList)
    dataset = sentimentStopWordRemover.transform(dataset)
    # NOTE: `spark` here is the module-level global set in sentimentAnalysis;
    # this method fails if called before that global is initialised.
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.stemming(dataset, pc.DMXSTOPWORDS)
    dataset = textProcessing.ngrams(dataset, pc.DMXSTOPWORDS, 2)
    dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)
    return dataset
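
# Self-contained sketch of the stop-word removal and bigram steps above, built on
# PySpark's StopWordsRemover and NGram. The project's StopWords.stopWordsKNIME list
# is replaced here with Spark's default English list as an assumption.
from pyspark.ml.feature import StopWordsRemover, NGram

def stopWordsAndBigramsSketch(dataset, tokenCol="tokens"):
    remover = StopWordsRemover(inputCol=tokenCol, outputCol="filtered_tokens",
                               stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    dataset = remover.transform(dataset)
    bigrams = NGram(n=2, inputCol="filtered_tokens", outputCol="bigrams")
    return bigrams.transform(dataset)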

def sentimentAnalysis(self, sentimentInfoData):
    spark = sentimentInfoData.get(pc.SPARK)
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    isNgram = sentimentInfoData.get(pc.ISNGRAM)
    sentimentDataset = self.textPreProcessing(sentimentInfoData)
    # Do the one-hot encoding after pre-processing.
    textProcessing = TextProcessing()
    sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
    # sentimentDataset = textProcessing.sparkLemmatizer(sentimentDataset, pc.DMXSTOPWORDS)
    if isNgram:
        ngramPara = sentimentInfoData.get(pc.NGRAMPARA)
        sentimentDataset = textProcessing.ngrams(sentimentDataset, pc.DMXLEMMATIZED, ngramPara)  # with n-grams
    modelName = sentimentInfoData.get(pc.MODELSHEETNAME)
    labelColm = sentimentInfoData.get(pc.LABELCOLM)
    indexedColm = pc.INDEXED_ + labelColm
    encodedColm = pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED
    featuresColm = modelName + pc.DMXFEATURE
    sentimentInfoData.update({pc.COLMTOENCODE: pc.DMXLEMMATIZED,
                              pc.DATASET: sentimentDataset,
                              pc.COLMTOINDEX: labelColm,
                              pc.INDEXEDCOLM: indexedColm,
                              pc.ENCODEDCOLM: encodedColm,
                              pc.COLMTOVECTORIZED: encodedColm,
                              pc.FEATURESCOLM: featuresColm,
                              pc.ORIGINALCOLMNAME: labelColm})
    sentimentInfoData = pu.stringIndexer(sentimentInfoData)  # yields the indexed label
    sentimentInfoData = pu.countVectorizer(sentimentInfoData)  # using the lemmatized column for now
    if isNgram:
        sentimentInfoData.update({pc.COLMTOENCODE: pc.DMXNGRAMS,
                                  pc.ENCODEDCOLM: pc.ONEHOTENCODED_ + pc.DMXNGRAMS})
        sentimentInfoData = pu.countVectorizer(sentimentInfoData)
        sentimentInfoData.update({pc.COLMTOVECTORIZED: [pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED,
                                                        pc.ONEHOTENCODED_ + pc.DMXNGRAMS]})
    sentimentInfoData = pu.featureAssembler(sentimentInfoData)  # creating the feature vector
    return sentimentInfoData
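
# Hedged sketch of what the pu.countVectorizer and pu.featureAssembler helpers are
# assumed to do, expressed directly with PySpark's CountVectorizer and VectorAssembler;
# the column names below are illustrative stand-ins for the pc.* constants.
from pyspark.ml.feature import CountVectorizer, VectorAssembler

def vectorizeAndAssembleSketch(dataset):
    cv = CountVectorizer(inputCol="dmx_lemmatized", outputCol="oneHotEncoded_dmx_lemmatized")
    dataset = cv.fit(dataset).transform(dataset)  # token-count vector per document
    assembler = VectorAssembler(inputCols=["oneHotEncoded_dmx_lemmatized"],
                                outputCol="features")
    return assembler.transform(dataset)  # single feature-vector column for training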

def sentimentAnalysis(self, sentimentInfoData):
    """Requirements:
    1. column name containing the sentiment sentence
    2. sentiment dataset parquet path
    3. positive dictionary parquet path
    4. negative dictionary parquet path
    5. column name in the positive/negative dictionaries containing the words/sentiment
    """
    sparkSession = sentimentInfoData.get(pc.SPARK)
    global spark
    spark = sparkSession
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    sentimentDataset = self.textPreProcessing(sentimentInfoData)
    textProcessing = TextProcessing()
    posNegDataset = textProcessing.mergePosNegDataset(sentimentInfoData)
    dataset = self.addTag(sentimentDataset, pc.DMXSTEMMEDWORDS, posNegDataset)
    return dataset
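
# Illustrative sketch of the dictionary-tagging idea behind addTag: explode the
# stemmed tokens and join them against the merged positive/negative word list.
# All column names here are assumptions, including a `word` key and a `tag` column
# in posNegDataset; this is not the project's actual implementation.
from pyspark.sql import functions as F

def addTagSketch(sentimentDataset, posNegDataset):
    exploded = sentimentDataset.select("dmx_index",
                                       F.explode("dmx_stemmed_words").alias("word"))
    tagged = exploded.join(posNegDataset, on="word", how="left")  # attaches the `tag` column
    return tagged.groupBy("dmx_index").agg(F.collect_list("tag").alias("tags"))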

def prediction(self, infoData):
    isNgram = False if infoData.get(pc.ISNGRAM) is None else infoData.get(pc.ISNGRAM)
    predictionColm = infoData.get(pc.PREDICTIONCOLM)
    algoName = infoData.get(pc.ALGORITHMNAME)
    modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
    spark = infoData.get(pc.SPARK)
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)
    originalDataset = spark.read.parquet(datasetPath)
    originalDataset = pu.addInternalId(originalDataset)
    infoData.update({pc.DATASET: originalDataset})
    infoData = self.dataTransformation(infoData)
    dataset = infoData.get(pc.DATASET)
    if isNgram:
        # TODO (sahil): handle a None value for the n-gram parameter at data-creation time.
        textProcessing = TextProcessing()
        ngramPara = infoData.get(pc.NGRAMPARA)
        dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED, ngramPara)
    # TODO (sahil): algorithm names are hard-coded for comparison; handle this while finalising.
    if algoName == "GradientBoostClassifier":
        predictionModel = GBTClassificationModel.load(modelStorageLocation)
    if algoName == "DecisionTreeClassifier":
        predictionModel = DecisionTreeClassificationModel.load(modelStorageLocation)
    dataset = dataset.drop(predictionColm)
    originalDataset = originalDataset.drop(predictionColm)
    dataset = predictionModel.transform(dataset)
    # Map the indexed prediction back to the original label after prediction.
    infoData.update({pc.DATASET: dataset})
    infoData = self.invertIndex(infoData)
    dataset = infoData.get(pc.DATASET)
    dataset = dataset.select(pc.DMXINDEX, predictionColm)
    finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
    return finalDataset
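
# Minimal sketch of the model-loading branch above: PySpark persists each model type
# under its own loader class, so the algorithm name decides which `load` to call.
# The mapping dict is an illustrative alternative to the hard-coded if-chain, and the
# function name is hypothetical.
from pyspark.ml.classification import (GBTClassificationModel,
                                       DecisionTreeClassificationModel)

loaders = {
    "GradientBoostClassifier": GBTClassificationModel,
    "DecisionTreeClassifier": DecisionTreeClassificationModel,
}

def loadPredictionModelSketch(algoName, modelStorageLocation):
    try:
        return loaders[algoName].load(modelStorageLocation)
    except KeyError:
        raise ValueError("unsupported algorithm: " + algoName)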

def textPreProcessing(self, sentimentInfoData):
    sentimentColName = sentimentInfoData.get(pc.SENTIMENTCOLNAME)
    dataset = sentimentInfoData.get(pc.DATASET)
    textProcessing = TextProcessing()
    dataset = textProcessing.toStringDatatype(dataset, sentimentColName)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentColName)
    dataset = textProcessing.createToken(dataset, sentimentColName)
    dataset = textProcessing.stopWordsRemover(dataset, pc.DMXTOKENIZED)
    dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)
    return dataset
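
# Hedged sketch of the cleaning and tokenizing steps that replaceSpecialChar and
# createToken are assumed to perform, using pyspark.sql.functions.regexp_replace and
# pyspark.ml.feature.Tokenizer; the regex and column names are illustrative.
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer

def cleanAndTokenizeSketch(dataset, textCol="review_text"):
    dataset = dataset.withColumn(textCol,
                                 F.regexp_replace(F.col(textCol), r"[^a-zA-Z\s]", " "))
    tokenizer = Tokenizer(inputCol=textCol, outputCol="dmx_tokenized")
    return tokenizer.transform(dataset)  # lower-cases and splits on whitespace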

def textPreProcessing(self, sentimentInfoData):
    sentimentColName = sentimentInfoData.get(pc.SENTIMENTCOLNAME)
    dataset = sentimentInfoData.get(pc.DATASET)
    lemmatizedModelPath = sentimentInfoData.get(pc.LEMMATIZEDPRETRAINEDMODEL)
    # NOTE: `spark` here is the module-level global set in sentimentAnalysis.
    textProcessing = TextProcessing(sparkSession=spark)
    dataset = textProcessing.toStringDatatype(dataset, sentimentColName)
    dataset = textProcessing.replaceSpecialChar(dataset, sentimentColName)
    dataset = textProcessing.createToken(dataset, sentimentColName)
    dataset = textProcessing.stopWordsRemover(dataset, pc.DMXTOKENIZED)
    dataset = textProcessing.sparkLemmatizer(dataset, pc.DMXSTOPWORDS, lemmatizedModelPath)
    return dataset
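
# The sparkLemmatizer call above loads a pretrained model through a project-specific
# API. As a rough, hedged stand-in (a different technique, not the project's), token
# arrays can be lemmatized with NLTK's WordNetLemmatizer wrapped in a UDF; requires
# running nltk.download("wordnet") once beforehand.
from nltk.stem import WordNetLemmatizer
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

lemmatizer = WordNetLemmatizer()
lemmatizeUdf = F.udf(lambda tokens: [lemmatizer.lemmatize(t) for t in (tokens or [])],
                     ArrayType(StringType()))

def lemmatizationSketch(dataset, tokenCol="dmx_stop_words"):
    return dataset.withColumn("dmx_lemmatized", lemmatizeUdf(F.col(tokenCol)))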