import sparknlp
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, StringType
from sparknlp.pretrained import PretrainedPipeline

# 1. Setup
# sparknlp.start() returns a SparkSession preconfigured with the Spark NLP jars
spark = sparknlp.start()
sc = spark.sparkContext

pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# 2. Data Cleansing
# read in data to a DataFrame
comments = spark.read.json('RC_2019-02-28-one-day')
# dummy_data = [["Hello, world!", "/r/soccer"], ["Wow. Simply wow. What an unbelievable pass, inch perfect.", "/r/nba"]]
# comments = sc.parallelize(dummy_data).toDF(['body', 'subreddit'])
comments.printSchema()

# Rename 'body' to 'text' for spark-nlp
comments = comments.withColumnRenamed('body', 'text')

# keep only the columns we're interested in
commentsCleaned = comments.select('subreddit', 'text')

# Filter out bad comment data
commentsCleaned = commentsCleaned.filter(commentsCleaned.text != '[deleted]') \
                                 .filter(commentsCleaned.text != '[removed]')
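
# The pipeline loaded above is never applied in this snippet; a minimal,
# hedged sketch of the missing step (assuming the cleaned DataFrame keeps
# the 'text' column the pretrained pipeline expects):
result = pipeline.transform(commentsCleaned)
# 'sentiment.result' holds the predicted labels added by the pipeline.
result.select('subreddit', 'text', 'sentiment.result').show(5, truncate=80)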
Example #2
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.5.0

import sparknlp 

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

from google.colab import drive
drive.mount('/content/drive')

dataset = spark.read.option("header", True).csv('drive/My Drive/bbc-text.csv')

dataset.show(10)

df_train, df_test = dataset.randomSplit([.7, .3])

df_train.show(5)
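
# A hedged sketch of how the split could feed a Spark NLP text classifier
# (assumes bbc-text.csv has 'category' and 'text' columns; the annotator
# choices below are illustrative, not the original author's):
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import UniversalSentenceEncoder, ClassifierDLApproach

document = DocumentAssembler().setInputCol("text").setOutputCol("document")

use_embeddings = UniversalSentenceEncoder.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

classifier = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("category") \
    .setMaxEpochs(5)

model = Pipeline(stages=[document, use_embeddings, classifier]).fit(df_train)
model.transform(df_test).select("text", "class.result").show(5)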
Example #3
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import base64

from sklearn.metrics import classification_report

# import findspark
# findspark.init()

import sparknlp

spark = sparknlp.start(gpu=True)

import pyspark
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.pretrained import PretrainedPipeline

import json
import time
import warnings
import os

warnings.filterwarnings("ignore")
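
# The original snippet stops after setup; a hedged sketch of a minimal
# Streamlit front end over a pretrained pipeline ('explain_document_dl'
# is an assumed choice, not taken from the original source):
pipeline = PretrainedPipeline('explain_document_dl', 'en')

user_text = st.text_area('Text to annotate', 'Spark NLP makes this easy.')
if user_text:
    # annotate() runs the pipeline on a single string and returns a dict
    # of annotator outputs (tokens, POS tags, entities, ...).
    st.write(pipeline.annotate(user_text))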
Example #4
import nlu
import sparknlp

def get_sample_sdf():
    # get_sample_pdf() (defined elsewhere in the NLU test utils) returns a pandas DataFrame
    nlu.spark = sparknlp.start()
    nlu.spark_started = True
    return nlu.spark.createDataFrame(get_sample_pdf())
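
# Hypothetical stand-in so the snippet runs standalone; the real
# get_sample_pdf() lives in the NLU test utilities:
import pandas as pd

def get_sample_pdf():
    return pd.DataFrame({'text': ['Hello world', 'Spark NLP works on DataFrames']})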
Example #5
from PredictionAlgorithms.SentimentAnalysis.SAMachineLearning import SAMachineLearning
from PredictionAlgorithms.PredictiveConstants import PredictiveConstants as pc
from PredictionAlgorithms.PredictiveUtilities import PredictiveUtilities as pu
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession
import sparknlp

# sparkTest = \
#     SparkSession.builder.appName('DMXPredictiveAnalytics').master('local[*]').getOrCreate()
# sparkTest.sparkContext.setLogLevel('ERROR')
sparkTest = sparknlp.start()


class SADecisionTreeClassifier(SAMachineLearning):
    def sentimentData(self, sentimentDataInfo):

        sentimentDataInfo = self.sentimentAnalysis(sentimentDataInfo)
        sentimentDataInfo = self.trainModel(sentimentDataInfo)
        sentimentDataInfo = self.invertIndexColm(sentimentDataInfo)
        modelName = sentimentDataInfo.get(pc.MODELSHEETNAME)
        storagePath = sentimentDataInfo.get(pc.STORAGELOCATION)
        jsonStorageLocation = storagePath + modelName
        # TODO(sahil): store the data in JSON format --> write a separate method for this.
        sentimentDataInfo.pop(pc.SPARK, "None")
        sentimentDataInfo.pop(pc.DATASET, "None")
        sentimentDataInfo.pop(pc.TESTDATA, "None")
        sentimentDataInfo.pop(pc.TRAINDATA, "None")
        sentimentDataInfo.pop(pc.MODEL, "None")
        # json.dump(sentimentDataInfo, open(storagePath + modelName + ".json", 'w'))
        pu.writeToJson(jsonStorageLocation, sentimentDataInfo)
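
# pu.writeToJson is project-specific; a minimal stand-in consistent with the
# commented-out json.dump call above (an assumption, not the real utility):
import json

def writeToJson(path, info):
    # Persist the metadata dict (already stripped of Spark objects) as JSON.
    with open(path + ".json", "w") as f:
        json.dump(info, f)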