-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
96 lines (84 loc) · 2.91 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from collections import defaultdict, Counter
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords, brown
import re
# Brown corpus category used as the training text.
category = 'news'
stop_Words = stopwords.words("english")
# Set copy of the stop-word list: O(1) membership tests inside the token loop
# (the list itself is kept under its original name).
_stop_word_set = set(stop_Words)

# Normalization
#   text level      ---> segmentation, tokenization, stop-word removal,
#                        OPTIONAL (stemming, lemmatization)
#   word level      ---> split, extract, delete, replace string
#   character level ---> case

# 01 Segmentation
sentences = brown.sents(categories=category)
tokens = brown.words(categories=category)

# Characters removed from every token: punctuation, quotes, tab, newline,
# backspace (``\b`` inside a character class) and digits.  Note that '.' is
# not in the class, so full stops survive.  Compiled once, outside the loop.
_strip_chars = re.compile(r'[-[_\],`!?():{}&$#@%*+;/\'"\t\n\b0-9]')

# Lowercase each token, strip the unwanted characters, then drop tokens that
# became empty or that are English stop words.
new_token = []
for w in tokens:
    word = _strip_chars.sub('', w.lower())
    if word and word not in _stop_word_set:
        new_token.append(word)
row_text = ' '.join(new_token)

# Unsupervised learning ML algorithm (Punkt) to detect end of sentences (EOS).
# The tokenizer is given the normalized text as its training text and is then
# used to segment that same text.
custom_sent_tokenizer = PunktSentenceTokenizer(row_text)
tokenized = custom_sent_tokenizer.tokenize(row_text)
# Re-join the detected sentences into one space-separated string.
last_text = ' '.join(tokenized)
# Prediction algorithm
def markov_chain(text):
    """Build a first-order Markov chain from space-separated text.

    The text is split on single spaces and every adjacent pair of words
    ``(current, next)`` is recorded.

    Args:
        text: The training text, words separated by single spaces.

    Returns:
        A plain ``dict`` mapping each word that has at least one successor to
        the list of words observed immediately after it, in order of
        appearance.  Duplicates are kept, so the list length equals how often
        the word was followed by something.  Text with fewer than two words
        yields an empty dict.
    """
    words = text.split(' ')
    successors = defaultdict(list)
    # Pair every word with the one that follows it; zip stops at the shorter
    # sequence, so the final word contributes no entry of its own.
    for current_word, next_word in zip(words, words[1:]):
        successors[current_word].append(next_word)
    # Return a regular dict so lookups of unknown words do not create keys.
    return dict(successors)
# Build the Markov model: each word maps to the list of words observed
# immediately after it in the normalized corpus (duplicates kept).
markov_return = markov_chain(last_text)
numOfKeys = len(markov_return)  # number of distinct words that have a successor
listOfKeys = list(markov_return)

# The corpus was lowercased during normalization, so lowercase the query as
# well; split() without arguments also ignores leading, trailing and repeated
# whitespace, so a trailing space no longer hides the last word.
inputText = input("Enter your line: ")
input_tokens = inputText.lower().split()
# Sentinel guarantees input_tokens[-1] exists even when the line is empty.
input_tokens.insert(0, '<start>')
last_word = input_tokens[-1]

# Count every adjacent word pair of the whole corpus, not just a prefix of it.
tokens = last_text.split(' ')
biGrams = [left + ' ' + right for left, right in zip(tokens, tokens[1:])]
count = Counter(biGrams)

# Direct dictionary lookup of the successors of the last typed word; an
# unknown word yields an empty list.
prediction = markov_return.get(last_word, [])

if prediction:
    # Diagnostic output kept from the original script: all observed successors.
    print(prediction)

    # Add-one smoothed bigram score for every candidate successor:
    #   (count(last_word, candidate) + 1) / (count(last_word) + vocabulary size)
    # len(prediction) is how many times last_word was followed by something.
    # The denominator is the same for every candidate.
    denominator = len(prediction) + numOfKeys
    dic = {}
    for candidate in prediction:
        dic[candidate] = (count[last_word + ' ' + candidate] + 1) / denominator

    # Highest score wins; on a tie the candidate seen first in the corpus is
    # chosen (max() returns the first maximal key in insertion order).
    best_word = max(dic, key=dic.get)
    print('We recommend this word to you : ', best_word)
else:
    print('No Predictions')