示例#1
0
def unify_dirs(key_dir,value_dir,encode=True):
    """Join every key of ``key_dir`` with the values its indices point at.

    Args:
        key_dir: mapping whose values are iterables of indices into
            ``value_dir``.
        value_dir: container indexed by those indices; each entry is a list
            whose items are appended to the key's result.
        encode: when truthy, the key and every collected value are passed
            through ``code_digraphs`` (defined elsewhere in the project)
            before being stored; otherwise they are stored untouched.

    Returns:
        A new dict mapping each (possibly encoded) key to the concatenation
        of the ``value_dir`` entries it references, in index order.
    """
    unified = {}
    for key in key_dir:
        # Flatten all value lists referenced by this key into one fresh list.
        collected = []
        for idx in key_dir[key]:
            collected.extend(value_dir[idx])
        if not encode:
            unified[key] = collected
            continue
        # Encode the key first, then the values, then store the pair.
        coded_key = code_digraphs(key)
        coded_values = [code_digraphs(item) for item in collected]
        unified[coded_key] = coded_values
    return unified

#forms=build_forms(u'resources/lab2/pocz.dat',
#	         u'resources/lab2/konc.dat')
#print(forms.stats())
示例#2
0
文件: histogram.py 项目: tjacek/WSJN
def build_forms_histogram(filename, forms2basic, hist_size=0):
    """Build a histogram of the basic forms that occur in a text file.

    The file is read with ``tools.read_text`` (``clean_txt=False``), split
    with ``tools.find_words``, and every word is passed through
    ``code_digraphs``. Words that are keys of ``forms2basic`` are replaced by
    their mapped value; all other words are dropped.

    Args:
        filename: path handed to ``tools.read_text``.
        forms2basic: mapping from an encoded word to its basic form.
        hist_size: forwarded to ``build_histogram`` as ``size``.

    Returns:
        Whatever ``build_histogram`` returns for the collected forms, called
        with ``laplace_smoothing=True``.
    """
    raw_text = tools.read_text(filename, clean_txt=False)
    coded_words = list(map(code_digraphs, tools.find_words(raw_text)))

    basic_forms = []
    for coded in coded_words:
        if coded in forms2basic:
            basic_forms.append(forms2basic[coded])

    # NOTE(review): unique_list presumably removes duplicates - confirm in tools.
    distinct_forms = tools.unique_list(basic_forms)
    return build_histogram(distinct_forms, laplace_smoothing=True, size=hist_size)
示例#3
0
文件: histogram.py 项目: tjacek/WSJN
def build_forms_histogram(filename,forms2basic,hist_size=0):
    """Return a histogram over the basic forms found in ``filename``.

    Steps: read the text (``tools.read_text`` with ``clean_txt=False``),
    extract words (``tools.find_words``), encode each one with
    ``code_digraphs``, keep only those present in ``forms2basic`` and map
    them to their basic form, then pass the result of ``tools.unique_list``
    to ``build_histogram``.

    Args:
        filename: path of the text file to analyse.
        forms2basic: mapping from encoded word form to basic form.
        hist_size: passed through to ``build_histogram`` as ``size``.

    Returns:
        The result of ``build_histogram`` with ``laplace_smoothing=True``.
    """
    content = tools.read_text(filename, clean_txt=False)
    tokens = tools.find_words(content)
    encoded = [code_digraphs(token) for token in tokens]
    # Only words with a known basic form contribute to the histogram.
    recognised = filter(lambda token: token in forms2basic, encoded)
    lemmas = tools.unique_list([forms2basic[token] for token in recognised])
    return build_histogram(lemmas, laplace_smoothing=True, size=hist_size)
示例#4
0
文件: bayes.py 项目: tjacek/WSJN
 def correct(self,new_word,unique=False):
     """Rank candidate corrections for ``new_word`` by ``self.p``.

     The word is encoded with ``code_digraphs``; the 5 nearest entries of
     ``self.forms_dict.all_basic()`` are selected by ``knn.nearest_k`` using
     ``norm_begin_metric``, and expanded through
     ``self.forms_dict.full_words``.

     Args:
         new_word: the word to correct.
         unique: when truthy, each neighbour is expanded separately and only
             the first word of its expansion is kept; otherwise all
             neighbours are expanded in a single call.

     Returns:
         A list of ``(word, self.p(encoded_new_word, word))`` tuples sorted
         by the second element, highest first.
     """
     coded = code_digraphs(new_word)
     basic_keys = self.forms_dict.all_basic()
     neighbours = knn.nearest_k(coded, basic_keys, k=5, metric=norm_begin_metric)
     if not unique:
         candidates = self.forms_dict.full_words(neighbours)
     else:
         candidates = [self.forms_dict.full_words([near])[0]
                       for near in neighbours]
     scored = [(cand, self.p(coded, cand)) for cand in candidates]
     return sorted(scored, key=lambda pair: pair[1], reverse=True)
示例#5
0
 def correct(self, new_word, unique=False):
     """Return correction candidates for ``new_word`` with their scores.

     ``new_word`` is encoded via ``code_digraphs`` and compared against
     ``self.forms_dict.all_basic()``; ``knn.nearest_k`` picks ``k=5``
     neighbours under ``norm_begin_metric``. The neighbours are turned into
     full words with ``self.forms_dict.full_words`` and each is scored with
     ``self.p``.

     Args:
         new_word: word to look up.
         unique: if truthy, keep exactly one full word (the first returned)
             per neighbour instead of the whole expansion.

     Returns:
         List of ``(full_word, score)`` pairs in descending score order.
     """
     encoded_word = code_digraphs(new_word)
     nearest = knn.nearest_k(
         encoded_word,
         self.forms_dict.all_basic(),
         k=5,
         metric=norm_begin_metric,
     )

     def expand(basics):
         # Thin wrapper so both branches read the same way.
         return self.forms_dict.full_words(basics)

     if unique:
         full_forms = []
         for basic in nearest:
             full_forms.append(expand([basic])[0])
     else:
         full_forms = expand(nearest)

     ranking = []
     for form in full_forms:
         ranking.append((form, self.p(encoded_word, form)))
     ranking.sort(key=lambda item: item[1], reverse=True)
     return ranking
示例#6
0
文件: lab2.py 项目: tjacek/WSJN
def main_loop():
    """Start the interactive correction loop.

    Reads lines from ``FORMS_SOURCE`` via ``tools.read_lines``
    (``clean_text=False``), encodes each with ``distance.code_digraphs``,
    builds a corrector with ``distance.curry_correct`` from the encoded and
    original lines, and hands it to ``tools.ui_loop`` together with
    ``get_word``.
    """
    originals = tools.read_lines(FORMS_SOURCE, clean_text=False)
    encoded = []
    for line in originals:
        encoded.append(distance.code_digraphs(line))
    corrector = distance.curry_correct(encoded, originals)
    tools.ui_loop(corrector, get_word)
示例#7
0
文件: lab2.py 项目: tjacek/WSJN
def main_loop():
    """Run the word-correction UI over the forms in ``FORMS_SOURCE``.

    The source lines are loaded with ``tools.read_lines``
    (``clean_text=False``) and an encoded copy is produced with
    ``distance.code_digraphs``. Both lists go to ``distance.curry_correct``,
    and the resulting callable is passed to ``tools.ui_loop`` along with
    ``get_word``.
    """
    source_lines = tools.read_lines(FORMS_SOURCE, clean_text=False)
    coded_lines = [distance.code_digraphs(entry) for entry in source_lines]
    tools.ui_loop(distance.curry_correct(coded_lines, source_lines), get_word)