forked from ejanzer/menureader
/
translate.py
180 lines (156 loc) · 5.65 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from model.dish import Dish
from model.dict_entry import Dict_Entry
from model.food_word import Food_Word
from model.base import db_session
import config
import datetime as dt
from timing import time_elapsed
# Maps a character to the list of characters observed immediately after it.
# Starts empty; generate_food_chains() fills it in, and find_substitutes()
# triggers that fill the first time it runs while this is still empty.
CHAINS = {}
def find_words(text):
    """Return every way of splitting text into consecutive, non-empty pieces.

    A string of length L has L - 1 gaps between characters, and each gap is
    either a split point or not, which gives 2 ** (L - 1) segmentations.

    The segmentations are ordered by counting upwards in binary, where the
    most significant bit corresponds to the gap after the first character.
    The first result is therefore the whole text as a single piece and the
    last result is one piece per character.

    Pieces are taken by slicing, so characters such as ',' inside text are
    kept as ordinary content instead of being mistaken for split markers.

    Returns a list of lists of strings. Empty text has nothing to segment and
    yields an empty list.
    """
    print("finding combinations")
    if not text:
        # len(text) - 1 would be negative here and 2 ** -1 is not an integer.
        return []

    gaps = len(text) - 1
    total_combos = []
    for mask in range(2 ** gaps):
        combo = []
        piece_start = 0
        for gap in range(gaps):
            # Bit for this gap, reading the mask from its most significant
            # bit so the ordering matches a zero-padded binary string.
            if (mask >> (gaps - 1 - gap)) & 1:
                combo.append(text[piece_start:gap + 1])
                piece_start = gap + 1
        # Whatever follows the last split point is the final piece.
        combo.append(text[piece_start:])
        total_combos.append(combo)
    return total_combos
def check_words(combinations):
    """Return the translation for the first combination that fully resolves.

    combinations is a list of candidate segmentations, each one a list of
    words. For every word, Food_Word.find_match is tried first and
    Dict_Entry.find_matches second; the JSON of whatever is found is added to
    that combination's translation.

    A single-character word with no match is kept with empty pinyin and
    english fields. A longer word with no match rules out the whole
    combination, and the next one is tried.

    Returns the list of JSON entries for the first combination in which every
    word was accounted for, or None when no combination qualifies.
    """
    for segmentation in combinations:
        translation = []
        for word in segmentation:
            food_word = Food_Word.find_match(word)
            if food_word:
                translation.append(food_word.get_json())
                continue

            entries = Dict_Entry.find_matches(word)
            if entries != []:
                translation.extend(entry.get_json() for entry in entries)
            elif len(word) == 1:
                # If the character isn't in the dictionary (usually punctuation)
                translation.append({
                    "char": word,
                    "pinyin": "",
                    "english": ""
                })
            else:
                # A multi-character word with no definition: give up on this
                # segmentation and move on to the next one.
                break
        else:
            # The inner loop finished without a break, so every word resolved.
            return translation
    return None
def translate(text):
    """Translate text by segmenting it and looking up each segmentation.

    Builds the candidate segmentations with find_words, passes them to
    check_words, and reports the duration of each step through time_elapsed.

    Returns whatever check_words returns.
    """
    checkpoint = dt.datetime.now()

    candidates = find_words(text)
    checkpoint = time_elapsed("Find words", checkpoint)

    translation = check_words(candidates)
    time_elapsed("Check words", checkpoint)

    return translation
def search_dish_name(text):
    """Look text up as a dish name, falling back to a translation.

    Input that is not already unicode is decoded as UTF-8 first.

    Returns:
        None when text is longer than 10 characters.
        The matching dish's JSON when Dish.find_match finds one.
        Otherwise a dict with a 'translation' key (the result of translate)
        and, when text is longer than one character and similar dishes were
        found, a 'similar' key holding their minimal JSON.
    """
    # timing information, can delete later.
    checkpoint = dt.datetime.now()

    if type(text) != unicode:
        text = text.decode('utf-8')

    if len(text) > 10:
        # Most dish names are 3-5 characters.
        # If Tesseract returned more than 10 characters, something probably went wrong.
        print("Input text is too long.")
        return None

    # Find a matching dish, if it exists.
    dish = Dish.find_match(text)
    if dish:
        # An exact match is returned on its own as JSON.
        dish_json = dish.get_json()
        time_elapsed("Dish lookup", checkpoint)
        return dish_json

    # No exact match: return translation data and similar dishes, if any.
    translation = translate(text)
    checkpoint = time_elapsed("Translation", checkpoint)
    results = {'translation': translation}

    if len(text) > 1:
        similar_dishes = Dish.find_similar(text)
        time_elapsed("Similar dish lookup", checkpoint)
        similar_json = [dish.get_json_min() for dish in similar_dishes]
        if similar_json != []:
            results['similar'] = similar_json

    return results
##### UNUSED FUNCTIONS #######
# I was contemplating using Markov chains for text correction, but
# I haven't implemented that yet.
def find_substitutes(text):
    """Suggest replacements for characters in text that CHAINS does not know.

    CHAINS is populated through generate_food_chains if it is still empty.
    The text is then scanned left to right. A character with a non-empty entry
    in CHAINS makes that entry the current candidate list. A character
    without one is treated as suspect: if the previous character left
    candidates behind, the most frequent candidate is proposed for this
    position. The candidate list is cleared after every suspect character.

    Returns a list of (suggested_character, index) tuples.
    """
    if CHAINS == {}:
        generate_food_chains()

    subs = []
    candidates = []
    for position, char in enumerate(text):
        followers = CHAINS.get(char)
        if followers:
            # Known character: remember what usually comes after it.
            candidates = followers
            continue

        if candidates != []:
            # choose the most popular option from candidates
            counts = {}
            for option in candidates:
                counts[option] = counts.get(option, 0) + 1

            best = None
            best_count = 0
            for option, tally in counts.iteritems():
                # Strict comparison: on a tie the first one iterated wins.
                if tally > best_count:
                    best = option
                    best_count = tally

            if best:
                subs.append((best, position))

        candidates = []

    return subs
def generate_food_chains():
    """Fill CHAINS with character-to-following-character observations.

    The words come from Food_Word.get_all_words() followed by
    Dish.get_all_dishes(). For each adjacent pair of characters in a word, the
    second character is appended to the CHAINS list kept for the first one.
    Repeats are kept, so a follower appears once per observation.

    Returns nothing; CHAINS is modified in place.
    """
    food_words = Food_Word.get_all_words()
    dishes = Dish.get_all_dishes()
    words = list(food_words) + list(dishes)

    # Generate Markov chains.
    # Since dish names are short, I'm using an n-gram size of 1.
    for word in words:
        for index in range(1, len(word)):
            current = word[index - 1]
            following = word[index]
            CHAINS.setdefault(current, []).append(following)