Exemplo n.º 1
0
def main():
    """Train the dictionary on an inclusive range of corpus passages.

    Reads the first and last passage numbers from standard input. For each
    number in that range the dictionary is reloaded with ``d.getdict()``,
    trained on ``corpus/<number>-std.txt`` through ``d.train_one_passage``
    and written back with ``d.output_to_dict``.
    """
    first = int(input("start:"))
    last = int(input("stop:"))

    for passage_no in range(first, last + 1):
        # The dictionary is fetched again on every pass, once per passage.
        dic, url_list = d.getdict()
        passage_path = 'corpus/{!r}-std.txt'.format(passage_no)
        d.train_one_passage(dic, url_list, text_str=passage_path)
        d.output_to_dict(dic, url_list)
Exemplo n.º 2
0
def Query():
    """Report whether the clipboard text is a word of the dictionary.

    Loads the entries with ``dictionary.getdict()``, reads the clipboard
    through ``queryLabel.clipboard_get()`` and publishes the verdict with
    ``QueryResult.set``.
    """
    entries, _ = dictionary.getdict()
    known_words = [entry["Word"] for entry in entries]

    queryword = queryLabel.clipboard_get()
    if queryword not in known_words:
        verdict = ' is NOT in dictionary'
    else:
        verdict = ' is IN dictionary'
    QueryResult.set(queryword + verdict)
Exemplo n.º 3
0
def Query():
    """Report whether the clipboard text, minus '|' marks, is in the dictionary.

    Loads the entries with ``dictionary.getdict()``, reads the clipboard
    through ``queryLabel.clipboard_get()``, drops every ``'|'`` character
    from it and publishes the verdict with ``QueryResult.set``.
    """
    entries, _ = dictionary.getdict()
    known_words = [entry["Word"] for entry in entries]

    # Remove every '|' character before looking the word up.
    queryword = queryLabel.clipboard_get().replace('|', '')

    if queryword not in known_words:
        verdict = ' is NOT in dictionary'
    else:
        verdict = ' is IN dictionary'
    QueryResult.set(queryword + verdict)
Exemplo n.º 4
0
    def Add_to_dictionary():
        """Interactively train the dictionary on a range of corpus passages.

        Prompts for ``Start`` and ``Stop`` and, for every index ``i`` in that
        inclusive range, reloads the dictionary with ``d.getdict()``, trains
        it on ``corpus/<i>-std.txt`` and persists it with
        ``d.output_to_dict``.

        Raises:
            ValueError: if either answer cannot be parsed by ``int``.
        """
        print('现在开始执行Add_to_dictionary()')

        def output_to_dict(dic, url_list):
            """Write ``dic`` to ``dict/<date_str>.dic`` and record that name.

            The file name is also written to ``dict/latest.log``.

            NOTE(review): this local helper is never called in this block;
            the loop below calls ``d.output_to_dict`` instead. ``url_list``
            is accepted but not used here, and ``date_str`` is expected to
            come from an enclosing scope that is not visible in this block.
            """

            def freshdict(dic, file_str):
                """Write every entry of ``dic`` to ``file_str``, one per line."""

                def format_fix(fix_dict):  # for Prefix and Suffix
                    """Render a mapping as ``key:value,`` pairs.

                    Every pair, including the last one, ends with a comma.
                    """
                    return ''.join(
                        key + ':' + str(count) + ','
                        for key, count in fix_dict.items())

                # ``with`` closes the file even if a write raises.
                with open(file_str, 'w', encoding='UTF-8') as f:
                    for entry in dic:
                        # Line layout:
                        # <word>|Word|<count>|Num|<prefix pairs>|Pre|<suffix pairs>|Suf|
                        f.write("{0}|Word|{1}|Num|{2}|Pre|{3}|Suf|\n".format(
                            entry['Word'], entry['Num'],
                            format_fix(entry['Pre']),
                            format_fix(entry['Suf'])))

            def freshurl(url_list, file_str):
                """Write the URLs to ``file_str``.

                Entries of ``url_list`` that are not in ``urls`` are written
                first, followed by every entry of ``urls``.

                NOTE(review): ``urls`` is not defined in this block and is
                expected to come from an enclosing scope; this helper is
                never called here.
                """
                with open(file_str, 'w', encoding='UTF-8') as f:
                    for ele in url_list:
                        if ele not in urls:
                            f.write(ele)
                    f.writelines(urls)

            # Refresh version information. The name is built before the log
            # is opened so a failure cannot leave ``latest.log`` truncated.
            file_str = date_str + '.dic'
            with open("dict/latest.log", "w") as f:
                f.write(file_str + '\n')
            freshdict(dic, "dict/" + file_str)

        start = int(
            input("Now we add words in ariticles into a fresh dict!\nStart:"))
        stop = int(input("Stop:"))

        for i in range(start, stop + 1):
            # The dictionary is reloaded for every passage.
            dic, url_list = d.getdict()
            d.train_one_passage(dic,
                                url_list,
                                text_str='corpus/' + repr(i) + '-std.txt')
            d.output_to_dict(dic, url_list)
Exemplo n.º 5
0
        in_dic=False

        for ele in dic:
            if phrase == ele['Word']:
                in_dic=True
                std_score+=length_coefficent[len(phrase)]
                prefix_match=ele['Pre'].get(phrase_list[index-1],1)
                suffix_match=ele['Suf'].get(phrase_list[index+1],1)
                average_match=(prefix_match+suffix_match)/2
                #Max?Min?Ave?
#                print(prefix_match,suffix_match,average_num)
                ratio=float(average_match)/float(average_num)
                std_score+=math.log10(ratio*10)*10
#                print(phrase,math.log10(ratio*10)*10)
##                if (ratio>=1):std_score+=math.log10(ratio*10)*10
##                if (ratio<1):std_score+=math.sqrt(ratio)*10-10
                #The scoring method#

                break
        if (in_dic==False):std_score-=5 # this is a parameter

    return std_score

if __name__ == '__main__':
    # Manual smoke test: score one hand-segmented sentence against the
    # dictionary returned by d.getdict() and print the segments and score.
    dic, u = d.getdict()
    text = '工作组|织|上|的|贸易|错|开|发票|等'
    t_l = text.split('|')
    print(t_l, score_after_segment(t_l, dic))
Exemplo n.º 6
0
def use_final(s1, dic, url, index):
    """Segment ``s1`` with ``final.segment`` and save the result to disk.

    The output file ``corpus/<index>-final.txt`` receives ``url`` followed
    immediately by the segmented text; no separator is added between them.

    Args:
        s1: Text passed to ``final.segment``.
        dic: Dictionary passed to ``final.segment``.
        url: String written at the start of the output file.
        index: Value whose ``repr`` becomes part of the output file name.
    """
    result_str = final.segment(s1, dic)
    out_path = 'corpus/' + repr(index) + '-final.txt'
    # ``with`` closes the file even when one of the writes raises.
    with open(out_path, 'w', encoding='UTF-8') as f:
        f.write(url)
        f.write(result_str)


import sys
import os
sys.path.append(os.path.abspath('.'))

# Batch driver: segment corpus/<i>.txt for every i in an inclusive range
# read from standard input and store each result through use_final().
# The guard used to compare against '___main__' (three leading
# underscores), which never matches, so this driver could not run.
# NOTE(review): assumes `dictionary` and `final` are imported elsewhere in
# this file -- confirm before relying on the script entry point.
if __name__ == '__main__':
    start = int(input("start:"))
    stop = int(input("stop:"))
    # Hard-coded range for quick debugging:
    ##start=10
    ##stop=10
    for i in range(start, stop + 1):
        # ``with`` closes the input file even if reading raises.
        with open('corpus/' + repr(i) + '.txt', 'r', encoding='UTF-8') as f:
            url = f.readline()  # first line is the URL, kept apart from the text
            s1 = f.read()
        dic, url_list = dictionary.getdict()
        # Alternative segmenters that can be swapped in here:
        ##    use_fmm(s1,dic,url,i)
        ##    use_stat(s1,dic,url,i)
        ##    use_stat_opt(s1,dic,url,i)
        ##    use_dp_opt(s1,dic,url,i)
        use_final(s1, dic, url, i)