/
preprocess.py
114 lines (91 loc) · 3.34 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
from stopwords import StopWords
from emoticons import Emoticons
class Preprocess:
    """Clean up a piece of free text and split it into tagged segments.

    The instance keeps the working text in ``self.text``; every cleaning
    method rewrites that attribute in place.  ``preprocess()`` runs the whole
    pipeline and returns a list of ``[text, emoticon]`` pairs, where
    ``emoticon`` is the emoticon that closed the segment or ``None`` when the
    segment was closed by ``.``, ``?`` or ``!`` (or by the end of the text).
    """

    # Negative contractions whose stem is not simply the word minus "n't".
    _IRREGULAR_NEGATIONS = {
        "won't": "will not",
        "can't": "can not",
        "shan't": "shall not",
    }

    def __init__(self, text):
        """Store *text* and make sure it ends with a sentence delimiter.

        Args:
            text: The raw string to preprocess.
        """
        self.text = text
        self.stopWords = StopWords()
        self.emoticons = Emoticons()
        # A closing delimiter guarantees the last sentence forms a segment.
        if not self.checkForEndingPunctuation():
            self.text = self.text + '.'

    def preprocess(self):
        """Run every cleaning step, then segment the text.

        Returns:
            A list of ``[text, emoticon_or_None]`` pairs (see ``segmentText``).
        """
        self.truncateElongatedWords()
        self.truncateElongatedPunctuations()
        self.removeMentions()
        self.removeHashtags()
        self.removeStopWords()
        self.completeContractions()
        self.removeURLs()
        return self.segmentText()

    def segmentText(self):
        """Split ``self.text`` into lower-cased ``[text, emoticon]`` pairs.

        Returns:
            A list of two-element lists.  A single trailing ``['', None]``
            pair is dropped; an empty text yields an empty list.
        """
        pieces = self.splitText()
        # re.split yields text, tag, text, tag, ..., text.  The final text is
        # '' when the input ends with a delimiter and can be discarded.
        # Earlier steps (URL / hashtag removal) may have stripped the closing
        # delimiter, though, so real trailing text is kept and paired with
        # None instead of being thrown away.
        if pieces and pieces[-1] == '':
            pieces.pop()
        elif pieces:
            pieces.append(None)
        segments = self.tagEmoticons(pieces)
        segments = self.toLowerCase(segments)
        return self.checkForLastSegmentElement(segments)

    def splitText(self):
        """Split ``self.text`` on ``.``, ``?``, ``!`` and on emoticons.

        Returns:
            The flat list produced by ``re.split``: text pieces alternate with
            the captured emoticon, or ``None`` when the split happened on a
            punctuation mark.
        """
        escaped = list(self.emoticons.getEscapedEmoticons())
        # With no emoticons an empty capture group would match at every
        # position; "(?!)" never matches, so only punctuation splits.
        emoticonPattern = '|'.join(escaped) if escaped else r'(?!)'
        return re.split(r"[.?!]|(" + emoticonPattern + ")", self.text)

    def tagEmoticons(self, segmentedText):
        """Group a flat ``[text, tag, text, tag, ...]`` list into pairs.

        Args:
            segmentedText: Flat list as returned by ``splitText``.

        Returns:
            A list of ``[text, tag]`` lists.  An unpaired final element is
            ignored.
        """
        return [
            [segmentedText[i], segmentedText[i + 1]]
            for i in range(0, len(segmentedText) - 1, 2)
        ]

    def truncateSuccessiveSpaces(self, segmentedText):
        """Collapse runs of spaces to one space in every segment's text.

        Not called by ``segmentText``; available as an optional step.

        Args:
            segmentedText: List of ``[text, tag]`` pairs.

        Returns:
            A new list of ``[text, tag]`` pairs.
        """
        return [[re.sub(' +', ' ', text), tag] for text, tag in segmentedText]

    def toLowerCase(self, segmentedText):
        """Lower-case the text of every segment, leaving the tag untouched.

        Args:
            segmentedText: List of ``[text, tag]`` pairs.

        Returns:
            A new list of ``[text, tag]`` pairs.
        """
        return [[segment[0].lower(), segment[1]] for segment in segmentedText]

    def checkForEndingPunctuation(self):
        """Return True when ``self.text`` ends with ``.``, ``!`` or ``?``."""
        return self.text.endswith(('.', '!', '?'))

    def checkForLastSegmentElement(self, segmentedText):
        """Drop a trailing ``['', None]`` pair, if present.

        Such a pair appears when the text ends with an emoticon followed by
        the closing delimiter.

        Args:
            segmentedText: List of ``[text, tag]`` pairs; modified in place.

        Returns:
            The same list object.
        """
        if segmentedText:
            lastText, lastTag = segmentedText[-1]
            if lastText == '' and lastTag is None:
                segmentedText.pop()
        return segmentedText

    def truncateElongatedWords(self):
        """Shorten any letter repeated three or more times to two letters.

        Only letters are affected, so numbers such as "1000" and punctuation
        runs (handled by ``truncateElongatedPunctuations``) stay intact.
        """
        self.text = re.sub(r'([^\W\d_])\1{2,}', r'\1\1', self.text)

    def truncateElongatedPunctuations(self):
        """Collapse a repeated ``!``, ``?`` or ``.`` into a single mark.

        Any run of two or more is collapsed, because a doubled delimiter
        would otherwise produce an empty segment when the text is split.
        """
        self.text = re.sub(r'([!?.])\1+', r'\1', self.text)

    def removeMentions(self):
        """Delete ``@name`` mentions from ``self.text``."""
        self.text = re.sub(r'@(\w+)', r'', self.text)

    def removeHashtags(self):
        """Drop trailing hashtags and strip ``#`` from the remaining ones.

        Space-separated tokens at the end of the text that start with ``#``
        are removed entirely; elsewhere only the ``#`` symbol is removed.
        """
        tokens = self.text.split(' ')
        # startswith() is safe on empty tokens and the emptiness check stops
        # the loop when the text consists of hashtags only.
        while tokens and tokens[-1].startswith('#'):
            tokens.pop()
        self.text = ' '.join(tokens).replace('#', '')

    def removeURLs(self):
        """Delete every ``scheme://...`` URL (up to the next whitespace)."""
        self.text = re.sub(r'\w+://\S+', '', self.text)

    def removeStopWords(self):
        """Delete whole-word occurrences of the configured stop words.

        Matching ignores case because the text is lower-cased only later,
        during segmentation.  Stop words are escaped so that any regex
        metacharacter in them is matched literally.
        """
        escapedWords = [
            re.escape(word) for word in self.stopWords.getStopWords() if word
        ]
        if not escapedWords:
            return
        pattern = r'\b(?:' + '|'.join(escapedWords) + r')\b'
        self.text = re.sub(pattern, '', self.text, flags=re.IGNORECASE)

    def completeContractions(self):
        """Expand negative contractions, e.g. "don't" -> "do not".

        Only the contracted word itself is rewritten; the surrounding text
        is left untouched.  Both ``'`` and ``’`` are accepted as apostrophe.
        """
        irregular = self._IRREGULAR_NEGATIONS

        def expand(match):
            word = match.group(0).lower().replace('\u2019', "'")
            if word in irregular:
                return irregular[word]
            return match.group(1) + ' not'

        self.text = re.sub(r"\b(\w+)n['\u2019]t\b", expand, self.text)