main.py
from stop_words import stop_words
from itertools import permutations
from itertools import islice
import math
import base64
from pyspark.sql import SparkSession
import sys
from sys import argv
import argparse
from pyspark.sql.context import SQLContext
import networkx as nx
import os
import graphframes
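# Note: graphframes is not bundled with PySpark; it is typically supplied at submit
# time. The package coordinates below are an assumption and must match your
# Spark/Scala build, and the data directory is illustrative:
#   spark-submit --packages graphframes:graphframes:0.8.2-spark3.0-s_2.12 \
#       main.py --input_path ./data --post_id 14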
def initializeSpark():
    spark = SparkSession.builder.master(
        "local[*]").appName("TDT4305 - Project Part 2").getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("WARN")
    return sc
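# make_rdds_from_dir builds one RDD per .csv/.csv.gz file in the input directory,
# keyed as "<filename>_rdd" (e.g. "posts.csv.gz" -> "posts_rdd"). Each line is
# split on TAB, and the mapPartitionsWithIndex/islice call drops the header row
# from the first partition.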
def make_rdds_from_dir(directory, sc):
    rdds = {}
    print("\n\nCreating RDD's from directory...\n")
    for filename in os.listdir(directory):
        if filename.endswith(".csv") or filename.endswith(".csv.gz"):
            data_file_name = filename.split('.', 1)[0]
            rdd = sc.textFile(directory + '/' +
                              filename).map(lambda line: line.split("\t"))
            rdd = rdd.mapPartitionsWithIndex(
                lambda a, b: islice(b, 1, None) if a == 0 else b
            )
            rdds[data_file_name + "_rdd"] = rdd
    if len(rdds) == 0:
        print("\nThere were no CSV files in the given directory.\n")
        return rdds
    print("Found " + str(len(rdds)) + " CSV files\n")
    return rdds
def parse_pls():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_path")
    parser.add_argument("--post_id")
    args = vars(parser.parse_args())
    directory = args["input_path"]
    post_id = args["post_id"]
    return directory, post_id
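# make_stripped_string looks up a single post by id, base64-decodes its body
# (column index 5), lowercases it, and strips HTML tags, punctuation and special
# characters so that only plain words separated by single spaces remain.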
def make_stripped_string(post_rdd, post_id):
    post = post_rdd.filter(lambda line: line[0] == post_id)
    # Base64-decodes the body of the post (6th column)
    body = post.map(lambda post: base64.b64decode(post[5])).take(1)
    # Convert to lowercase string
    lowercase = str(body).lower()
    # Removes the first three and last two characters (the [b' and '] of the bytes repr)
    lowercase = lowercase[3:][:-2]
    # Removes &#xA; (HTML-encoded newline), paragraph and bold tags
    lowercase = lowercase.replace("&#xA;", "")
    lowercase = lowercase.replace("<p>", "")
    lowercase = lowercase.replace("</p>", "")
    lowercase = lowercase.replace("<b>", "")
    lowercase = lowercase.replace("</b>", "")
    # Removes punctuation
    punctuations_to_remove = "!,:;?"
    doc = lowercase
    for punctuation in punctuations_to_remove:
        doc = doc.replace(punctuation, "")
    # First replace DOT with whitespace (whitespace separates words at the beginning and end of sentences).
    # This is done instead of removing DOT from tokens later, as that would not fix the problem of words at the
    # end and start of sentences becoming one token. (An example of this would be 'analyzed.my' on post_id = 14)
    doc = doc.replace(".", " ")
    # Removes whitespace and TAB characters (\t)
    a_list = doc.split()
    doc = " ".join(a_list)
    filtered = doc
    # Using triple quotes to include quotation marks in the string
    special_chars = """"'#$%&<=>@~()*+-/[]^_`{|}"""
    for character in special_chars:
        filtered = filtered.replace(character, "")
    return filtered
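# Illustration (assuming "this" appears in the custom stop-word list; "is" and
# "a" are already dropped by the length filter):
#   tokenize("this is a graph based ranking") -> ['graph', 'based', 'ranking']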
def tokenize(string):
    # Tokenizes the string
    tokens = string.split(" ")
    # Removes tokens with word length < 3
    new_tokens = []
    for token in tokens:
        if len(token) > 2:
            new_tokens.append(token)
    # Remove stopwords
    sw = stop_words()
    word_tokens = new_tokens
    filtered_tokens = []
    for w in word_tokens:
        if w not in sw:
            filtered_tokens.append(w)
    return filtered_tokens
def remove_dupes(tokens):
    unique = []
    for i in tokens:
        if i not in unique:
            unique.append(i)
    return unique
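# assign_id_to_list enumerates the unique tokens, e.g.:
#   assign_id_to_list(['graph', 'rank']) -> [(0, 'graph'), (1, 'rank')]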
def assign_id_to_list(words):
    tokens = []
    for id, word in enumerate(words):
        tup = (id, word)
        tokens.append(tup)
    return tokens
# Finds the edges within the given window size
def create_edges(arr, window_size):
    temp = []
    edges = []
    for element in arr:
        if len(temp) == window_size:
            # Creates tuples for all the relationships within the window size
            relations = permutations(temp, 2)
            for r in relations:
                # Removes reflexive relationships
                if r[0] != r[1]:
                    edges.append(r)
            # Remove the first element from the temporary list
            temp.pop(0)
        temp.append(element)
    return edges
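# Illustration with a window size of 2: the window is only expanded once it is
# full, and the final window ([3, 4] here) is left unexpanded.
#   create_edges([1, 2, 3, 4], 2) -> [(1, 2), (2, 1), (2, 3), (3, 2)]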
def get_id(tuples, token):
    for id, i in tuples:
        if i == token:
            return id
    return -1
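# assign_unique_ids maps every token in the original (possibly repeated) token
# list back to its unique id via get_id, e.g.:
#   assign_unique_ids([(0, 'graph'), (1, 'rank')], ['graph', 'rank', 'graph'])
#       -> [(0, 'graph'), (1, 'rank'), (0, 'graph')]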
def assign_unique_ids(unique_tuple, tokens):
    # Iterates through all the tokens and finds the unique id for each token
    final = []
    for token in tokens:
        unique = get_id(unique_tuple, token)
        pair = (unique, token)
        final.append(pair)
    return final
def remove_dupe_tuples(lst):
    return [t for t in (set(tuple(i) for i in lst))]
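# main() wires the pipeline together: read the posts RDD, clean and tokenize the
# body of the requested post, build co-occurrence edges between token ids within
# a sliding window of 5, and rank the tokens with GraphFrames' PageRank
# (resetProbability=0.15, tol=0.0001), printing the word/pagerank table.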
def main():
    sc = initializeSpark()
    spark = SparkSession(sc)
    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]
    string = make_stripped_string(post_rdd, post_id)
    print("\n Body from post_id: " + str(post_id) +
          ", stripped of whitespaces and special characters:\n")
    print("'" + string + "'\n")
    # Tokenize the string
    tokens = tokenize(string)
    # Remove duplicate entries
    tokens_unique = remove_dupes(tokens)
    # Assign an id to each unique token
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these id's to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)
    print("\nTokens retrieved from the body with their respective id's: \n")
    for i in token_id_all:
        print(i)
    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])
    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Removes duplicate edges from the list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")
    sqlContext = SQLContext(sc)
    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])
    e = sqlContext.createDataFrame(edges, ["src", "dst"])
    g = graphframes.GraphFrame(v, e)
    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
if __name__ == "__main__":
    main()