def get_sentence_accuracy_top_one():
    '''样本文件中词的准确度'''
    from cut_sentence import Cut_Sentence
    matched_words_count = 0
    total_words_count = 312525.0
    cs = Cut_Sentence()
    correct_sentence_set = set()
    top_one_sentence_list = []
    correct_sentence_list = []
    checkout_filename = os.path.join(PATH, '0709modify',
                                     'aaaaaaaaaaaa_weight.txt')
    with codecs.open(checkout_filename, encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if line.startswith('*'):
                # correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:]))
                # correct_sentence_list = cs.cut_with_weight(line.strip()[1:])
                correct_sentence_set = cs.cut_with_weight(line.strip()[1:])
                count = 0
            else:
                count += 1
                if count == 1:
                    sentence = line.split('\t')[0]
                    # top_one_sentence_list.extend(cs.cut_with_weight(sentence))
                    top_one_sentence_list = cs.cut_with_weight(sentence)
                    for words in top_one_sentence_list:
                        if words in correct_sentence_set:
                            matched_words_count += 1
    # print len(top_one_sentence_list), len(correct_sentence_list)
    print matched_words_count / total_words_count
def cuted_varify_sample():
    '''Segment the varify_sample file using weight values.

    Reads each line of cuted_varify_sample.txt, cuts it with
    Cut_Sentence.cut_with_weight, and writes the space-joined words to
    only_cuted_sentence_varify_sample.txt.

    NOTE(review): a function with this exact name is defined again later
    in this file; the later definition shadows this one at import time.
    '''
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    word_input_role_dic = {}

    def _load_input_role():
        '''Load a dict mapping each word of the 61633-entry base lexicon
        to its input rule (last tab-separated column).'''
        input_role_filename = os.path.join(
            PATH, '0709modify', 'combine_5233_and_top60000_pinyin_role.txt')
        with codecs.open(input_role_filename, encoding='utf-8') as f:
            for line in f:
                splited_line = line.split('\t')
                word = splited_line[0]
                word_input_role_dic[word] = splited_line[-1].strip()

    # _load_input_role() is deliberately not called: the input-role columns
    # are not written to the output below.

    varify_sample_filename = os.path.join(PATH, '0709modify',
                                          'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram',
                                   'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8') as f, \
            codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f:
            word_list = ws.cut_with_weight(line.strip())
            cuted_sentence = ' '.join(word_list)
            wf.write(cuted_sentence + '\n')
def get_words_accuracy_top_one():
    '''样本文件中词的准确度AW'''
    from cut_sentence import Cut_Sentence
    matched_words_count = 0
    # total_words_count = 312525.0
    cs = Cut_Sentence()
    correct_sentence_set = set()
    total_top_one_sentence_list = []
    top_one_sentence_list = []
    correct_sentence_list = []
    checkout_filename = os.path.join(PATH, '0709modify', 'cut_path_lenght_limit_50.txt')
    with codecs.open(checkout_filename, encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if line.startswith('*'):
                correct_sentence_list.extend(cs.cut_with_weight(line.strip()[1:]))
                correct_sentence_set = cs.cut_with_weight(line.strip()[1:])
                count = 0
            else:
                count += 1
                if count == 1:
                    sentence = line.split('\t')[0]
                    total_top_one_sentence_list.extend(cs.cut_with_weight(sentence))
                    # total_top_one_sentence_list.extend(sentence.split())
                    top_one_sentence_list = cs.cut_with_weight(sentence)
                    for words in top_one_sentence_list:
                        if words in correct_sentence_set:
                            matched_words_count += 1
    print len(total_top_one_sentence_list), len(correct_sentence_list)
    print str(matched_words_count/float(len(total_top_one_sentence_list))*100)+'%'
def gen_word_freq_from_linguistic_data():
    '''Count word frequencies over the lexicon + sentence corpus and dump
    them as "word<TAB>count" lines to word_freq_from_95K.txt.

    NOTE(review): `src_filename` is not defined in this function — it is
    presumably a module-level global; confirm it is bound before calling.
    '''
    cs = Cut_Sentence()
    whole_word_freq_dic = {}
    with codecs.open(src_filename, encoding='utf-8') as f:
        for line in f:
            # The original special-cased len == 1 with code identical to the
            # general branch; one loop covers both.  dict.get replaces the
            # redundant membership set.
            for word in cs.cut(line):
                whole_word_freq_dic[word] = whole_word_freq_dic.get(word, 0) + 1
    temp_filename = os.path.join(PATH, '0709modify', 'word_freq_from_95K.txt')
    word_freq_str_list = ['\t'.join((key, str(value))) + '\n'
                          for (key, value) in whole_word_freq_dic.items()]
    with codecs.open(temp_filename, mode='wb', encoding='utf-8') as wf:
        wf.writelines(word_freq_str_list)
def cuted_varify_sample():
    '''Segment the varify_sample file using weight values.

    NOTE(review): duplicate definition — a function with this exact name
    appears earlier in this file; this later definition shadows it at
    import time.  Both bodies are functionally identical.
    '''
    from cut_sentence import Cut_Sentence
    ws = Cut_Sentence()
    word_input_role_dic = {}

    def _load_input_role():
        '''Load a dict mapping each word of the 61633-entry base lexicon
        to its input rule (last tab-separated column).'''
        input_role_filename = os.path.join(PATH, '0709modify', 'combine_5233_and_top60000_pinyin_role.txt')
        with codecs.open(input_role_filename, encoding='utf-8') as f:
            for line in f.readlines():
                splited_line = line.split('\t')
                word = splited_line[0]
                input_role = splited_line[-1].strip()
                word_input_role_dic[word] = input_role
    # Deliberately disabled: the input-role columns are not written below.
    # _load_input_role()

    varify_sample_filename = os.path.join(PATH, '0709modify', 'cuted_varify_sample.txt')
    output_filename = os.path.join(PATH, '0709modify', 'unigram', 'only_cuted_sentence_varify_sample.txt')
    with codecs.open(varify_sample_filename, encoding='utf-8')as f,\
    codecs.open(output_filename, mode='wb', encoding='utf-8') as wf:
        for line in f.readlines():
            # Cut the stripped line into words and emit them space-joined.
            word_list = ws.cut_with_weight(line.strip())
            cuted_sentence = ' '.join(word_list)
            # input_role_list = [word_input_role_dic[item] for item in word_list]
            # input_role_str = ' '.join(input_role_list)
            # com_str = '\t'.join((line.strip(),input_role_str))

            wf.write(cuted_sentence+'\n')
def cut_lines_into_words():
    '''将行(句子)切割成词,其间以空格隔开'''
    from cut_sentence import Cut_Sentence
    cs = Cut_Sentence()
    for file_count in range(26, 29):
        print file_count
        src_filename = os.path.join(PATH, '0709modify', 'cuted_linguistic_stample', '%s.txt'%file_count)
        try:
            assert os.path.exists(src_filename)
        except AssertionError:
            print '%s does not exist !!'%src_filename

        with codecs.open(src_filename, encoding='utf-8') as f:
            cuted_lines_list = [' '.join(cs.cut_with_weight(line))+'\n' for line in f.readlines()]
        codecs.open(src_filename, mode='wb', encoding='utf-8').writelines(cuted_lines_list)