def get_sentence_accuracy_top_one():
    '''样本文件中词的准确度'''
    from cut_sentence import Cut_Sentence
    matched_words_count = 0
    total_words_count = 312525.0
    cs = Cut_Sentence()
    correct_sentence_set = set()
    top_one_sentence_list = []
    correct_sentence_list = []
    checkout_filename = os.path.join(PATH, '0709modify',
                                     'aaaaaaaaaaaa_weight.txt')
    with codecs.open(checkout_filename, encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if line.startswith('*'):
                # correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:]))
                # correct_sentence_list = cs.cut_with_weight(line.strip()[1:])
                correct_sentence_set = cs.cut_with_weight(line.strip()[1:])
                count = 0
            else:
                count += 1
                if count == 1:
                    sentence = line.split('\t')[0]
                    # top_one_sentence_list.extend(cs.cut_with_weight(sentence))
                    top_one_sentence_list = cs.cut_with_weight(sentence)
                    for words in top_one_sentence_list:
                        if words in correct_sentence_set:
                            matched_words_count += 1
    # print len(top_one_sentence_list), len(correct_sentence_list)
    print matched_words_count / total_words_count
def cuted_varify_sample():
    '''Segment the varify_sample file using weight values.

    Reads each line of cuted_varify_sample.txt, cuts it with
    Cut_Sentence.cut_with_weight, and writes the space-joined words to
    only_cuted_sentence_varify_sample.txt.

    NOTE(review): a function with this exact name is defined again later
    in this file; the later definition shadows this one at import time.
    '''
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    word_input_role_dic = {}

    def _load_input_role():
        '''Load a dict mapping each word of the 61633-entry base lexicon
        to its input rule (last tab-separated column).'''
        input_role_filename = os.path.join(
            PATH, '0709modify', 'combine_5233_and_top60000_pinyin_role.txt')
        with codecs.open(input_role_filename, encoding='utf-8') as f:
            for line in f:
                splited_line = line.split('\t')
                word = splited_line[0]
                word_input_role_dic[word] = splited_line[-1].strip()

    # _load_input_role() is deliberately not called: the input-role columns
    # are not written to the output below.

    varify_sample_filename = os.path.join(PATH, '0709modify',
                                          'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram',
                                   'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8') as f, \
            codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f:
            word_list = ws.cut_with_weight(line.strip())
            cuted_sentence = ' '.join(word_list)
            wf.write(cuted_sentence + '\n')
def get_words_accuracy_top_one():
    '''样本文件中词的准确度AW'''
    from cut_sentence import Cut_Sentence
    matched_words_count = 0
    # total_words_count = 312525.0
    cs = Cut_Sentence()
    correct_sentence_set = set()
    total_top_one_sentence_list = []
    top_one_sentence_list = []
    correct_sentence_list = []
    checkout_filename = os.path.join(PATH, '0709modify', 'cut_path_lenght_limit_50.txt')
    with codecs.open(checkout_filename, encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if line.startswith('*'):
                correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:]))
                correct_sentence_set = cs.cut_with_weight(line.strip()[1:])
                count = 0
            else:
                count += 1
                if count == 1:
                    sentence = line.split('\t')[0]
                    total_top_one_sentence_list.extend(cs.cut_with_weight(sentence))
                    # total_top_one_sentence_list.extend(sentence.split())
                    top_one_sentence_list = cs.cut_with_weight(sentence)
                    for words in top_one_sentence_list:
                        if words in correct_sentence_set:
                            matched_words_count += 1
    print len(total_top_one_sentence_list), len(correct_sentence_list)
    print str(matched_words_count/float(len(total_top_one_sentence_list))*100)+'%'
def gen_word_freq_from_linguistic_data():
    '''Count word frequencies over the lexicon + sentence corpus and dump
    them as "word<TAB>count" lines to word_freq_from_95K.txt.

    NOTE(review): `src_filename` is not defined in this function — it is
    presumably a module-level global; confirm it is bound before calling.
    '''
    cs = Cut_Sentence()
    whole_word_freq_dic = {}
    with codecs.open(src_filename, encoding='utf-8') as f:
        for line in f:
            # The original special-cased len == 1 with code identical to the
            # general branch; one loop covers both.  dict.get replaces the
            # redundant membership set.
            for word in cs.cut(line):
                whole_word_freq_dic[word] = whole_word_freq_dic.get(word, 0) + 1
    temp_filename = os.path.join(PATH, '0709modify', 'word_freq_from_95K.txt')
    word_freq_str_list = ['\t'.join((key, str(value))) + '\n'
                          for (key, value) in whole_word_freq_dic.items()]
    with codecs.open(temp_filename, mode='wb', encoding='utf-8') as wf:
        wf.writelines(word_freq_str_list)
def cuted_varify_sample():
    '''Segment the varify_sample file using weight values.

    NOTE(review): duplicate definition — a function with this exact name
    appears earlier in this file; this later definition shadows it at
    import time.  Both bodies are functionally identical.
    '''
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    word_input_role_dic = {}

    def _load_input_role():
        '''Load a dict mapping each word of the 61633-entry base lexicon
        to its input rule (last tab-separated column).'''
        input_role_filename = os.path.join(PATH, '0709modify', 'combine_5233_and_top60000_pinyin_role.txt')
        with codecs.open(input_role_filename, encoding='utf-8') as f:
            for line in f.readlines():
                splited_line = line.split('\t')
                word = splited_line[0]
                input_role = splited_line[-1].strip()
                word_input_role_dic[word] = input_role
    # Deliberately disabled: the input-role columns are not written below.
    # _load_input_role()

    varify_sample_filename = os.path.join(PATH, '0709modify', 'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram', 'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8')as f,\
    codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f.readlines():
            # Cut the stripped line into words and emit them space-joined.
            word_list = ws.cut_with_weight(line.strip())
            cuted_sentence = ' '.join(word_list)
            # input_role_list = [word_input_role_dic[item] for item in word_list]
            # input_role_str = ' '.join(input_role_list)
            # com_str = '\t'.join((line.strip(),input_role_str))

            wf.write(cuted_sentence+'\n')
def cut_lines_into_words():
    '''将行(句子)切割成词,其间以空格隔开'''
    from cut_sentence import Cut_Sentence
    cs = Cut_Sentence()
    for file_count in range(26, 29):
        print file_count
        src_filename = os.path.join(PATH, '0709modify', 'cuted_linguistic_stample', '%s.txt'%file_count)
        try:
            assert os.path.exists(src_filename)
        except AssertionError:
            print '%s does not exist !!'%src_filename

        with codecs.open(src_filename, encoding='utf-8') as f:
            cuted_lines_list = [' '.join(cs.cut_with_weight(line))+'\n' for line in f.readlines()]
        codecs.open(src_filename, mode='wb', encoding='utf-8').writelines(cuted_lines_list)