def save_data(data_1, data_2, data_3, data_path_1, data_path_2, data_path_3, stop_words_path=''):
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        count_1 = 0
        for line in data_1:
            # print(line)
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # seg_words = []
                # for j in seg_list:
                #     if j in stopwords:
                #         continue
                #     seg_words.append(j)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f1.write('%s' % seg_line)
                    f1.write('\n')
                    count_1 += 1
        print('train_x_length is ', count_1)
    with open(data_path_2, 'w', encoding='utf-8') as f2:
        count_2 = 0
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # seg_words = []
                # for j in seg_list:
                #     if j in stopwords:
                #         continue
                #     seg_words.append(j)
                # if len(seg_list) > 0:
                seg_line = ' '.join(seg_list)
                f2.write('%s' % seg_line)
                f2.write('\n')
                count_2 += 1
        print('train_y_length is ', count_2)
    with open(data_path_3, 'w', encoding='utf-8') as f3:
        count_3 = 0
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f3.write('%s' % seg_line)
                    f3.write('\n')
                    count_3 += 1
        print('test_y_length is ', count_3)

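# Minimal sketch of the helpers that save_data() and preprocess_sentence() assume.
# The stopword-file format (one word per line) and the optional remove list are
# assumptions made for illustration, not confirmed by the snippets in this section.
def read_stopwords(path):
    stopwords = set()
    if path:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    stopwords.add(word)
    return stopwords

def remove_words(words, remove_list=None):
    # Drop unwanted tokens; the concrete remove list is project-specific.
    remove_set = set(remove_list) if remove_list else set()
    return [w for w in words if w not in remove_set]
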
def indexing_search(text):
    key_word = remove_punctuation(text)
    if not key_word:
        return []
    rlist = []
    # If the query is an ISBN (10 or 13 digits), search by ISBN only.
    c = re.match(r"(\d{13}|\d{10})", key_word.strip())
    if c:
        isbn = c.group(0)
        rlist += indexing_search_by_word(isbn)
        return rlist
    # Otherwise, search each whitespace-separated term plus the full query string.
    words = key_word.split()
    words.append(key_word)
    for w in words:
        rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    if rlist:
        return rlist
    # If whitespace splitting finds nothing, fall back to semantic word segmentation.
    words = tokenizer.segment(key_word)
    for w in words:
        if isinstance(w, str):  # the Python 2 original also checked `unicode`
            rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    return rlist

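# indexing_search() relies on remove_punctuation(), which is not shown here. A
# minimal sketch, assuming it replaces ASCII and common CJK punctuation with
# spaces; the exact character set is an assumption.
import re
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        return ''
    cjk_punct = '，。、！？；：“”‘’（）《》【】'
    table = {ord(ch): ' ' for ch in string.punctuation + cjk_punct}
    return text.translate(table).strip()
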
def preprocess_sentence(sentence):
    # segment(sentence, cut_type='word', pos=False) calls jieba.lcut(sentence),
    # which already returns a list.
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

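# Sketch of the segment() wrapper described in the comment above: cut_type='word'
# delegates to jieba.lcut. The cut_type='char' branch (used by segment_sentence
# below) and the unused pos flag are assumptions for illustration.
import jieba

def segment(sentence, cut_type='word', pos=False):
    if cut_type == 'word':
        return jieba.lcut(sentence)
    # Assumed behaviour for char-level segmentation: one token per character.
    return list(sentence)
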
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list, REMOVE_WORDS)
    # filter stopwords
    stopwords_path = '{}/datasets/stopwords.txt'.format(BASE_DIR)
    stopwords = read_stopwords(stopwords_path)
    seg_list = remove_words(seg_list, stopwords)
    seg_line = ' '.join(seg_list)
    return seg_line

def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')  # returns the result of jieba.lcut(sentence)
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

def preprocess_sentence(sentence):
    # Segment the sentence, drop the words in the remove list, and rejoin the
    # remaining tokens with spaces to form a new sentence.
    seg_list = segment(sentence.strip(), cut_type='word')  # word-segment one line of data
    seg_list = remove_words(seg_list)                      # drop the tokens that should be removed
    seg_line = ' '.join(seg_list)
    return seg_line

def segment_sentence(data):
    source = []
    for line in data:
        source.append(segment(line.strip(), cut_type='char'))
    return source

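# Example use of segment_sentence() on hypothetical input lines; under the
# assumed segment() sketch above, cut_type='char' yields one token per character.
lines = ['今天天气很好', '我们去公园散步']
char_tokens = segment_sentence(lines)
# e.g. char_tokens[0] == ['今', '天', '天', '气', '很', '好']
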
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')  # tokenize; returns a list
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

def indexing_by_sentence(book, sentence):
    sentence = remove_punctuation(sentence)
    if not sentence:
        return
    words = tokenizer.segment(sentence)
    indexing_by_words(book, words)

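# indexing_by_sentence() and indexing_search() both assume indexing_by_words()
# and indexing_search_by_word() on top of some inverted index. A minimal
# in-memory sketch, assuming the index maps a word to the set of books that
# contain it; the real storage backend is not shown in these snippets.
from collections import defaultdict

_index = defaultdict(set)

def indexing_by_words(book, words):
    for w in words:
        if w:
            _index[w].add(book)

def indexing_search_by_word(word):
    return list(_index.get(word, set()))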