## twitterParserT.py
## A parser for the Twitter data file containing all the JSON data. Reads in the JSON data and writes out a matrix of the (logical) form:
##      0, word1, word2, word3, ..., wordn
## tweet1,     0,     1,     1, ...,     0
## tweet2,     1,     1,     0, ...,     0
##    ...,   ...,   ...,   ..., ...,   ...
## tweetm,     0,     1,     0, ...,     0
## where n is the number of unique and relevant words in the whole set of Tweets and m is the number of Tweets that are successfully extracted from the data.
## An entry is 1 whenever the word of a particular column appears in the Tweet of a particular row, and 0 otherwise.
## Note that the code below actually writes the transpose of this matrix (one row per word, one column per Tweet), with the word indices saved to a separate file.
## The output file can then be used for Sparse PCA.
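## A hypothetical sketch (an assumption, not part of the original pipeline) of how the
## resulting file "S" could later be loaded in Python, given the format written below
## (0/1 entries separated by ", ", one row per word, with a trailing ", " per row):
##   import numpy as np
##   with open("S") as f:
##       rows = [[int(x) for x in line.rstrip(", \n").split(", ")] for line in f]
##   S = np.array(rows)  # shape: (number of words, number of tweets)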
import json
from bagOfWords import BagOfWords
from dictionaryOfWords import DictionaryOfWords
from Tweet import Tweet
# To test, uncomment the next two lines
#jsonFile = open('/Users/theopavlakou/Documents/Imperial/Fourth_Year/MEng_Project/TWITTER Research/Data (100k tweets from London)/twitteranalysis/testJsons', 'r')
#commonWordsFile = open('/Users/theopavlakou/Documents/Imperial/Fourth_Year/MEng_Project/TWITTER Research/Data (100k tweets from London)/twitteranalysis/commonWords', 'r')
jsonFile = open('/Users/theopavlakou/Documents/Imperial/Fourth_Year/MEng_Project/TWITTER Research/Data (100k tweets from London)/twitteranalysis/xab', 'r')
#jsonFile = open('/Users/theopavlakou/Documents/Imperial/Fourth_Year/MEng_Project/TWITTER Research/Data (100k tweets from London)/twitteranalysis/tweets_London_22Sep12_03Oct12', 'r')
#commonWordsFile = open('/Users/theopavlakou/Documents/Imperial/Fourth_Year/MEng_Project/TWITTER Research/Data (100k tweets from London)/twitterdataanalysis/CommonWordsPlain.txt', 'r')
print("--- Loading Tweets ---")
tweetSet = []
# Make a decoder to decode the JSON string
decoder = json.JSONDecoder()
while True:
    try:
        # Read the file one line at a time
        line = jsonFile.readline()
        # readline() returns a non-empty string until EOF is reached,
        # so we can break out of the loop when it returns ""
        if line == "":
            break
        # Translate the JSON string into its Python representation
        jsonObject = decoder.decode(line)
        # Make a new Tweet and add it to the set of Tweets that we have
        # TODO
        # For some reason this doesn't work when the date is added. Must fix.
        #tweet = Tweet(jsonObject["text"], jsonObject["created_at"])
        tweet = Tweet(jsonObject["text"])
        # print("Tweet content is " + tweet.content + ", and it was created on " + "NOT TODAY")
        tweetSet.append(tweet)
    # Some of the lines have encoding errors or invalid JSON, and some
    # objects lack a "text" field, so skip any line that cannot be parsed
    except (UnicodeDecodeError, ValueError, KeyError):
        pass
jsonFile.close()
print("--- Finished loading Tweets ---")
print("--- Printing size of Tweets ---")
# Print the number of tweets
print(" Size: " + str(len(tweetSet)))
print("--- Loading most common words in the Tweets ---")
# Make a list of all the unique words in the tweets which will be the columns of the matrix.
# Should reduce the > 75,000 words we have by about a factor of 10
#bagOfWords = BagOfWords(0.015)
dictOfWords = DictionaryOfWords()
for tweet in tweetSet:
    dictOfWords.addFromSet(tweet.listOfWords())
    #bagOfWords.addFromSetWithProbability(tweet.listOfWords())
#listOfWords = bagOfWords.getListOfWords()
#listOfWords = dictOfWords.getMostPopularWords(2500)
listOfWords = dictOfWords.getMostPopularWordsAndOccurrences(3000)
print("--- Finished loading most common words in the Tweets ---")
print("--- Printing number of unique and relevant words ---")
print(" Size: " + str(len(listOfWords)))
print("--- Finished printing number of unique and relevant words ---")
# Write out a matrix with len(listOfWords) rows and len(tweetSet) columns. No header row or column is written; the words indexing the rows are written (1-indexed) to a separate file below.
print("--- Opening file to output result to ---")
fileOut = open("S", "w")
# Make a file that has the words that are indexed by the columns of the matrix with their index to the left.
print("--- Opening file to output index of words to ---")
wordsFile = open("commonWordsIndex", "w")
# MATLAB starts indexing from 1
i = 1
for (word, occurrence) in listOfWords:
    wordsFile.write(str(i) + " " + word + " " + str(occurrence) + "\n")
    i = i + 1
print("--- Closing file to output index of words to ---")
wordsFile.close()
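# Each line of "commonWordsIndex" therefore has the form
# "<rowIndex> <word> <occurrenceCount>", e.g. (with a made-up count):
#   1 london 5123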
# Make a file that has the tweet content of the tweets that are indexed by the rows of the matrix with their index to the left.
#print("--- Opening file to output index of tweets to ---")
#tweetsFile = open("tweetsIndex", "w")
# MATLAB starts indexing from 1
#i = 1
#for tweet in tweetSet:
#    tweetsFile.write(str(i) + " " + tweet.content + "\n")
#    i = i + 1
#print("--- Closing file to output index of tweets to ---")
#tweetsFile.close()
# For each word, write a 1 in each column whose associated Tweet contains
# the word and a 0 in each column whose Tweet does not.
# Print to an output file as a .csv
print("--- Printing matrix to file ---")
# Each word is a row
for (word, occurrence) in listOfWords:
    # Each tweet is a column
    for tweet in tweetSet:
        # Debug output only; printing every (word, tweet) pair floods stdout
        # print(word + " " + tweet.content)
        tweetWordDict = tweet.dictionaryOfWords()
        if word in tweetWordDict:
            fileOut.write("1, ")
        else:
            fileOut.write("0, ")
    fileOut.write("\n")
print("--- Finished printing matrix to file ---")
print("--- Closing file to output result to ---")
fileOut.close()
print("--- End ---")