-
Notifications
You must be signed in to change notification settings - Fork 0
/
bag_of_words.py
86 lines (71 loc) · 2.6 KB
/
bag_of_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Bag-of-words feature extraction for Amazon review data (PySpark, Python 2).
# Maps each review's text to a 2000-dimensional sparse vector of word-cluster
# frequencies, using a precomputed word -> cluster-id lookup table
# (clusterFinal.pickle), then splits the vectors into train/test by year.
from pyspark import SparkConf, SparkContext,SQLContext
# SparkContext.setSystemProperty("hadoop.home.dir", "C:\\bigdata\\spark-1.5.1-bin-hadoop2.6\\")
import sys, operator
import json
import string
import re
from pyspark.sql.types import StructType, StructField, StringType, FloatType
import nltk
# NOTE(review): Word2Vec, KMeans/KMeansModel, numpy.array, sqrt, operator,
# json and string are imported but never used in this file -- likely leftovers
# from the clustering step that produced clusterFinal.pickle.
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
import pickle
from pyspark.mllib.linalg import Vectors
# path to the nltk data directory.
# nltk.data.path.append("C:\\Users\\Dell\\Desktop\\bd-inputs\\nltk_data")
nltk.data.path.append("/cs/vml2/avahdat/CMPT733_Data_Sets/Assignment3/nltk_data")
# NOTE(review): these three module-level lists are never read or written at
# module level; clean_words() rebinds clean_list as a local, shadowing this one.
clean_list = []
word_vectors = []
final_list = []
def clean_words(line):
    """Tokenize one review line: strip punctuation, lowercase, split to words.

    Every character that is neither a word character nor whitespace is
    replaced by a space, the text is lowercased, and the result is split on
    runs of whitespace.  Returns a list of lowercase word tokens; an empty or
    punctuation-only line yields [] (the original returned [''], injecting a
    bogus empty token into the downstream bag-of-words).
    """
    # Keep \w and \s, turn everything else (punctuation, symbols) into spaces.
    no_punct = re.sub(r'[^\w\s]', ' ', line)
    # str.split() with no argument splits on ANY whitespace run (spaces, tabs,
    # newlines) and never produces empty tokens.  The original only collapsed
    # literal spaces (' +'), so tabs/newlines could leave merged tokens and a
    # leading/trailing tab survived .strip(' ')-style cleanup.
    return no_punct.lower().split()
def get_sparseVector(x):
    """Turn a token list into a 2000-dim normalized bag-of-clusters vector.

    Each token is looked up in the module-level `cluster` dict (word ->
    cluster id, loaded from clusterFinal.pickle); unknown words are dropped.
    The value stored at each cluster id is that cluster's relative frequency
    among the mapped tokens, so the non-zero entries sum to 1.  Returns an
    all-zero sparse vector when no token is in the vocabulary (the original
    raised ZeroDivisionError on that input).
    """
    # `j in cluster` is an O(1) dict membership test; the original's
    # `j in cluster.keys()` materializes the key list on Python 2, making
    # each lookup O(vocabulary size).
    ids = [cluster[j] for j in x if j in cluster]
    bag_words = {}
    if ids:
        total = float(len(ids))
        # Single O(n) counting pass; the original called ids.count(i) inside
        # the loop, which is O(n^2) over the token list.
        for i in ids:
            bag_words[i] = bag_words.get(i, 0.0) + 1.0
        for i in bag_words:
            bag_words[i] = bag_words[i] / total
    # Fixed 2000-cluster feature space, matching the k-means model size.
    return Vectors.sparse(2000, bag_words)
# ---------------------------------------------------------------------------
# Driver script (flat, runs at import time).  Python 2 ONLY: the
# tuple-unpacking lambdas below (`lambda (x,y): ...`) are a SyntaxError on
# Python 3.  Usage: spark-submit bag_of_words.py <input_json_dir> <output_dir>
# ---------------------------------------------------------------------------
input = sys.argv[1]   # NOTE: shadows the `input` builtin
output = sys.argv[2]
conf = SparkConf().setAppName('bag_words')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Word -> cluster-id lookup produced by an earlier k-means step; opened from
# the current working directory, so launch the job from where the pickle lives.
with open('clusterFinal.pickle', 'rb') as f:
    cluster=pickle.load(f)
# Explicit schema: skip JSON inference and read only these three review fields.
schema = StructType([
    StructField('reviewText', StringType(), False),StructField('overall', FloatType(), False),StructField('reviewTime', StringType(), False)
])
df = sqlContext.read.json(input, schema=schema)
df.registerTempTable('review_table')
# Tokenize each review and map it to a 2000-dim bag-of-clusters sparse vector.
sd=sqlContext.sql("""
SELECT reviewText FROM review_table
""")
fin=sd.rdd.map(lambda x: str(x.reviewText)).map(clean_words)
sparse_vectors=fin.map(get_sparseVector)
# reviewTime is presumably of the form "MM DD, YYYY" -- splitting on ', '
# keeps the year in the second element.  TODO confirm against the input data.
time=sqlContext.sql("""
SELECT reviewTime FROM review_table
""")
time_split=time.rdd.map(lambda x: str(x.reviewTime)).map(lambda line: line.split(', '))
year_list=time_split.map(lambda (x,y):y).collect()
score=sqlContext.sql("""
SELECT overall FROM review_table
""")
score_list=score.rdd.map(lambda x:str(x.overall)).collect()
sparse_list=sparse_vectors.collect()
# NOTE(review): the three columns are fetched by three SEPARATE queries and
# zipped back together on the driver -- this silently assumes all three jobs
# return rows in exactly the same order.  A single query selecting all three
# columns in one pass would pair them up robustly; verify before reuse.
zip_list=zip(sparse_list, year_list, score_list)
zip_rdd=sc.parallelize(zip_list)
# Train/test split by review year: every year except 2014 trains, 2014 tests.
# Each triple is reduced to (sparse_vector, score); coalesce(1) writes one part.
zip_train=zip_rdd.filter(lambda (x,y,z): y!= '2014').map(lambda (x,y,z):(x,z)).coalesce(1)
zip_test=zip_rdd.filter(lambda (x,y,z): y == '2014').map(lambda (x,y,z):(x,z)).coalesce(1)
# zip_train.saveAsPickleFile(output+"/bow_train")
# zip_test.saveAsPickleFile(output+"/bow_test")
zip_train.saveAsPickleFile(output+"/bow_train_small")
zip_test.saveAsPickleFile(output+"/bow_test_small")