def all_words():
    # Return the cached word list if present; otherwise build it from the two
    # word-form files ('ordmyndalisti') and cache the combined result.
    data = retrieve('ordmyndalisti')
    if data is not None:
        return data
    else:
        data1 = to_unicode_or_bust(open(os.path.join(os.path.dirname(__file__), 'ordmyndalisti.txt'), 'r').read())
        data2 = to_unicode_or_bust(open(os.path.join(os.path.dirname(__file__), 'ordmyndalisti2.txt'), 'r').read())
        data = data1 + data2
        store('ordmyndalisti', data)
        return data
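# Every snippet in this section assumes a to_unicode_or_bust helper that is
# not shown. A minimal sketch of it, following the widely circulated Python 2
# recipe (decode byte strings to unicode, pass unicode through untouched);
# the projects' actual versions may differ:
def to_unicode_or_bust(obj, encoding='utf-8'):
    # Only basestring instances can be decoded; other objects pass through.
    if isinstance(obj, basestring):
        if not isinstance(obj, unicode):
            obj = unicode(obj, encoding)
    return obj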
def perm_search(string):
    pmatched = []
    errors = []
    if len(string) > 10:
        # User-facing message, in Icelandic: "For technical reasons it is not
        # possible to search for an anagram longer than 10 characters."
        errors.append(u"Af tæknilegum ástæðum er ekki hægt að finna stafarugl sem er lengra en 10 stafir.")
    else:
        #perms = list(set([u''.join(p) for p in permutations(list(string), len(string))]))
        perms = set(all_perms(to_unicode_or_bust(string))) # way better! 35 milliseconds for 7 letter word!
        pmatched = list(perms.intersection(words))
        logging.info(pmatched)
        # Rejected alternatives, kept for reference with their timings:
        #worst
        #print set(words) & set(perms)
        #best, 18 seconds for 7 letter words, 1.8 seconds for 6 letter word
        #pmatched = [x for x in words if x in perms]
        #bad, 40 seconds for 7 letter words, 4 seconds for 6 letter word
        #for word in words:
        #    for p in perms:
        #        if p == word:
        #            pmatched.append(word)
    return {'matches': pmatched, 'errors': errors}
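# perm_search relies on an all_perms helper that is not shown. A minimal
# sketch using itertools (an assumption; the original may be a hand-rolled
# generator), yielding each permutation joined back into a string. Repeated
# letters produce duplicate permutations, but the caller collapses those by
# wrapping the result in set():
from itertools import permutations

def all_perms(string):
    for p in permutations(string):
        yield u''.join(p)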
def post(self):
    which = self.request.get('which')
    perms = self.request.get('perms', None)
    regex = self.request.get('regex', None)
    d = {}
    if which == 'regex':
        regex = ''.join([r'\s', to_unicode_or_bust(regex), r'\s'])
        rmatches = regex_search2(regex)
        d.update(regex=rmatches)
    if which == 'perms':
        pmatches = perm_search(perms)
        d.update(perms=pmatches)
    self.response.out.write(json.dumps(d))
def __insert_string_to_buffer(self, string, handle=None):
    try:
        unicode_string = to_unicode_or_bust(string, self.__encoding)
        utf8_string = unicode_string.encode("utf-8")
        buf = self.__document.Buffer
        buf.begin_not_undoable_action()
        buf.set_text(utf8_string)
        buf.end_not_undoable_action()
        buf.set_modified(False)
    # UnicodeEncodeError must be caught before ValueError: the Unicode error
    # classes are ValueError subclasses, so a ValueError clause listed first
    # would swallow them and the encode handler could never run.
    except UnicodeEncodeError:
        self._error(i.exception_unicode_encode)
    except UnicodeDecodeError:
        self._error(i.exception_unicode_decode)
    except ValueError:
        self._error(i.exception_unicode_decode)
    return
def save_file(self, document, filename=None, encoding='utf-8'):
    # TODO: Implement File Save operation
    buf = document.Buffer
    string = buf.get_text(buf.get_start_iter(), buf.get_end_iter())
    handler = open(document.filename, 'w')
    unicode_string = to_unicode_or_bust(string, encoding)
    # Encode before writing: the file was opened in byte mode, so writing a
    # unicode string with non-ASCII characters would raise UnicodeEncodeError.
    handler.write(unicode_string.encode(encoding))
    handler.flush()
    handler.close()
    last_mod = get_last_modification(document.filename)
    self.__document.last_modified_time = last_mod
    # 1. Check for other program modifications by comparing the stored last
    #    modified time with the file's current modification time
    # 2. Check for permissions to save
    # 3. Encode file before writing to disk
    # 4. Write to a tmp file
    # 5. Copy tmp file over original file (for crash prevention)
    # 6. Delete old file
    return True
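# A sketch of the write-to-tmp-then-replace steps (3-6) from the TODO list
# above. The name _atomic_write is illustrative, not from the original:
import os
import tempfile

def _atomic_write(path, unicode_string, encoding='utf-8'):
    # Step 3: encode before writing to disk.
    data = unicode_string.encode(encoding)
    # Step 4: write to a temporary file in the same directory.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        os.write(fd, data)
    finally:
        os.close(fd)
    # Steps 5-6: replace the original in a single rename, so a crash never
    # leaves a half-written file behind (atomic on POSIX filesystems).
    os.rename(tmp_path, path)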
def read_file(filename):
    # Use a with block so the file handle is closed (the original leaked it).
    with open(os.path.join(os.path.dirname(__file__), 'static', filename)) as f:
        return [to_unicode_or_bust(l.strip()) for l in f]
def create_corpus_from_wiki(self, corpus_root, filename, output_dir):
    create_error_corpus = False
    valid_word_pat = ur'(?u)^\w+$'
    sentences = utils.get_sentences_for_text(corpus_root, filename)
    if sentences is None:
        return
    top_rev = []
    top_rev_with_err = []
    try:
        for s_list in sentences:
            s = ' '.join(s_list)
            if s.startswith('[Revision timestamp:'):
                self.num_rev += 1
            else:
                if self.num_rev == 1:
                    # First revision: collect sentences long enough to keep.
                    if len(s_list) >= self.min_sen_len:
                        rev_sen = RevisionSentence(s_list)
                        top_rev.append(rev_sen)
                elif self.num_rev > 1:
                    # Later revisions: align against first-revision sentences
                    # of the same length and look for small token edits.
                    for r in top_rev:
                        if len(s_list) == len(r.orig_tokens):
                            valid_errors = True
                            errors = False
                            old_curr_rev_sen = zip(r.orig_tokens, s_list)
                            for t in old_curr_rev_sen:
                                dist = utils.levenshtein_distance(t[0], t[1])
                                if dist > 0 and dist <= self.max_dist:
                                    # token must be a word
                                    orig_uni = utils.to_unicode_or_bust(t[0])
                                    match = re.search(valid_word_pat, orig_uni)
                                    if match:
                                        errors = True
                                elif dist > self.max_dist:
                                    valid_errors = False
                                    break
                            if errors and valid_errors:
                                print 'errr'
                                r.add_err_sentence(s_list)
                                create_error_corpus = True
                                break
    except AssertionError:
        print 'Empty file'
    if create_error_corpus:
        with codecs.open(output_dir + '/' + filename, 'w', 'utf-8', errors='ignore') as f:
            for r in top_rev:
                if r.contains_spelling_errors():
                    orig_sen = ' '.join(r.orig_tokens)
                    err_as_sen = map(lambda x: ' '.join(x), r.err_sen)
                    orig_err_sen = [orig_sen] + err_as_sen
                    to_write = '####'.join(orig_err_sen)
                    to_write_uni = unicode(to_write, encoding='utf-8', errors='ignore')
                    f.write(to_write_uni + u'\n')
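# create_corpus_from_wiki calls utils.levenshtein_distance, which is not
# shown. A minimal sketch of the standard dynamic-programming edit distance
# it presumably implements (insertions, deletions, and substitutions each
# cost 1), using two rolling rows instead of the full matrix:
def levenshtein_distance(a, b):
    # prev[j] holds the distance from the current prefix of a to b[:j].
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[len(b)]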