langID.py
#!/usr/bin/env python
# coding=utf-8
# Machine-learning language identification (document classification) program.
# Implements a bag-of-words model with a Naive Bayes classifier.
# Designed for languages that use the Latin alphabet.
import sys
import codecs
import math
import langlib
SMOOTHING_PARAM = 1 # Smoothing parameter for Laplace smoothing
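# With Laplace (add-k) smoothing, each word likelihood is estimated as
#   P(word | lang) = (count of word in lang + SMOOTHING_PARAM)
#                    / (lang.wordCount + SMOOTHING_PARAM * lang.uniqueWordCount)
# (formula inferred from the counts used in classify() below)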
# Dictionary of language objects
languages = {}
langlib.loadLanguages(languages)
def train():
    """ Training mode. Assumes all training texts have never been seen before. """
    for lang in languages.iterkeys():
        # Read in documents so that UTF-8 encoding is handled properly
        f = codecs.open('trainingTexts/' + languages[lang].fileName, encoding='utf-8')
        texts = f.read().split('====') # Individual documents are separated by ====
        f.close()
        for text in texts:
            processed = langlib.processText(text)
            # Convert processed into a Document representing a bag of words
            document = langlib.Document()
            for word in processed:
                if word in document.words: # count every occurrence, regardless of word order
                    document.words[word] += 1
                else:
                    document.words[word] = 1
            # Update the language's vocabulary and write to disk
            languages[lang].vocabUpdate(document)
            languages[lang].write()
            languages[lang].textCount += 1
            langlib.totalTextCount += 1
    # Update the language list file
    langlib.updateLangList(languages)
def classify(fileName):
    """ Classify mode. Identifies the language of the text in the file named fileName. """
    f = codecs.open(fileName, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    processed = langlib.processText(text)
    # Convert processed into a Document representing a bag of words
    document = langlib.Document()
    for word in processed:
        if word in document.words: # count every occurrence, regardless of word order
            document.words[word] += 1
        else:
            document.words[word] = 1
    # Calculate the posterior probability of each language given the input
    langProbs = {}
    # The evidence term (the sum over all languages of P(l) times the product over
    # all words in the document of P(word | l), with Laplace smoothing) can be
    # omitted: it is a constant that every language's score is divided by, so it
    # does not change which language ranks highest. The commented-out block below
    # shows that full computation.
    """
    evidence = 0
    for l in languages.iterkeys():
        langProb = languages[l].textCount / float(langlib.totalTextCount)
        product = 1
        for word in document.words:
            if word in languages[l].vocab:
                product *= float(languages[l].vocab[word] + SMOOTHING_PARAM)
                product /= float(languages[l].wordCount + SMOOTHING_PARAM * languages[l].uniqueWordCount)
            else:
                product *= float(SMOOTHING_PARAM)
                product /= float(languages[l].wordCount + SMOOTHING_PARAM * languages[l].uniqueWordCount)
        evidence += langProb * product
    """
    for lang in languages.iterkeys():
        prior = languages[lang].textCount / float(langlib.totalTextCount)
        priorExponent = math.floor(math.log(prior, 10))
        priorSignificand = prior / (10 ** priorExponent)
        prior = [priorSignificand, priorExponent] # Scientific notation [significand, exponent]
        # Product over all words in the document of P(word|lang) using Laplace smoothing
        likelihood = [1, 0] # Scientific notation [significand, exponent]
        denominator = float(languages[lang].wordCount + SMOOTHING_PARAM * languages[lang].uniqueWordCount)
        denomExponent = math.floor(math.log(denominator, 10))
        denomSignificand = denominator / (10 ** denomExponent)
        for word in document.words:
            if word in languages[lang].vocab:
                probability = float(languages[lang].vocab[word] + SMOOTHING_PARAM)
            else:
                probability = float(SMOOTHING_PARAM)
            probExponent = math.floor(math.log(probability, 10))
            probSignificand = probability / (10 ** probExponent)
            # Divide the probability by the denominator, which applies to all words in a given language
            probSignificand /= denomSignificand
            probExponent -= denomExponent
            # Because the raw probabilities get very tiny, work in scientific notation:
            # each value is tracked as significand * (10 ** exponent)
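            # Illustrative example of this bookkeeping (values made up for clarity):
            # multiplying 2.0e-5 by 3.0e-7 is tracked as (2.0 * 3.0, -5 + -7) = (6.0, -12),
            # so the significand stays in a normal floating-point range no matter how
            # many word probabilities are multiplied together.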
            likelihood[0] *= probSignificand
            likelihood[1] += probExponent
        langProbs[lang] = [prior[0] * likelihood[0], prior[1] + likelihood[1]] # Scientific notation
        # Normalize so that the significand is less than 10
        normalizerExponent = math.floor(math.log(langProbs[lang][0], 10))
        langProbs[lang][0] /= 10 ** normalizerExponent
        langProbs[lang][1] += normalizerExponent
    for lang in langProbs:
        print lang, langProbs[lang]
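    # Each printed value is [significand, exponent], i.e. significand * 10 ** exponent;
    # the language with the largest exponent (ties broken by the larger significand)
    # is the most probable one for the input text.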
def main():
    if len(sys.argv) == 2:
        if sys.argv[1] == '-t': # Training mode
            train()
        else: # Classify mode
            classify(sys.argv[1])
    else:
        print 'Use option -t for training mode, or pass the name of a text file to classify.'
if __name__ == '__main__':
    main()
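# Usage (based on main() above; 'sample.txt' is just a placeholder file name):
#   python langID.py -t           train on the documents in trainingTexts/
#   python langID.py sample.txt   classify the text in sample.txt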