def save_data(data_1,
              data_2,
              data_3,
              data_path_1,
              data_path_2,
              data_path_3,
              stop_words_path=''):
    # Segment each line of data_1/data_2/data_3 and write the space-joined
    # tokens to the corresponding output file, one sample per line.
    stopwords = read_stopwords(stop_words_path)  # read here but not applied below
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        count_1 = 0
        for line in data_1:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f1.write('%s' % seg_line)
                    f1.write('\n')
                    count_1 += 1
        print('train_x_length is ', count_1)

    with open(data_path_2, 'w', encoding='utf-8') as f2:
        count_2 = 0
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # note: unlike data_1 and data_3, empty segmentations are still written here
                seg_line = ' '.join(seg_list)
                f2.write('%s' % seg_line)
                f2.write('\n')
                count_2 += 1
        print('train_y_length is ', count_2)

    with open(data_path_3, 'w', encoding='utf-8') as f3:
        count_3 = 0
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f3.write('%s' % seg_line)
                    f3.write('\n')
                    count_3 += 1
        print('test_y_length is ', count_3)
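
# A minimal usage sketch for save_data (hypothetical; the sample sentences and
# file names below are placeholders, not data or paths from the source project):
train_x = ['今天天气很好，适合出门', '我在学习自然语言处理']
train_y = ['天气很好', '学习自然语言处理']
test_y = ['明天可能会下雨']
save_data(train_x, train_y, test_y,
          'train_set.seg_x.txt', 'train_set.seg_y.txt', 'test_set.seg_y.txt',
          stop_words_path='stopwords.txt')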
Example #2
def indexing_search(text):
    key_word = remove_punctuation(text)
    if not key_word:
        return []
    rlist = []
    # If the query is an ISBN (10 or 13 digits), search by ISBN only
    c = re.match(r"(\d{13}|\d{10})", key_word.strip())
    if c:
        isbn = c.group(0)
        rlist += indexing_search_by_word(isbn)
        return rlist
    # Otherwise, split on whitespace and search each token (plus the full query)
    words = key_word.split()
    words.append(key_word)
    for w in words:
        rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    if rlist:
        return rlist
    # If whitespace splitting finds nothing, fall back to semantic segmentation
    words = tokenizer.segment(key_word)
    for w in words:
        if isinstance(w, str) or isinstance(w, unicode):  # `unicode` exists on Python 2 only
            rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    return rlist
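
# Illustrative queries for the three branches above (hypothetical inputs; results
# depend on whatever indexing_search_by_word has already indexed):
#   indexing_search('9787111128069')   # pure ISBN query: the digit match short-circuits the rest
#   indexing_search('python 编程')      # whitespace query: each token plus the full string, deduplicated
#   indexing_search('流畅的python')     # no whitespace hits: falls back to tokenizer.segment()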
Example #3
def preprocess_sentence(sentence):
    # segment(sentence, cut_type='word', pos=False) uses jieba.lcut(sentence)
    # jieba.lcut already returns a list
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line
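
# Since segment() wraps jieba.lcut (per the comment above), roughly the same step
# can be sketched with jieba directly. DROP_TOKENS stands in for whatever
# remove_words filters; it is a placeholder, not the project's real list.
import jieba

DROP_TOKENS = {'|', '[', ']'}  # placeholder filter set

def preprocess_sentence_plain(sentence):
    tokens = jieba.lcut(sentence.strip())                # word-level segmentation
    tokens = [t for t in tokens if t not in DROP_TOKENS]  # drop filtered tokens
    return ' '.join(tokens)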
Example #4
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list, REMOVE_WORDS)
    # filter stopwords
    stopwords_path = '{}/datasets/stopwords.txt'.format(BASE_DIR)
    stopwords = read_stopwords(stopwords_path)  # note: re-read from disk on every call
    seg_list = remove_words(seg_list, stopwords)

    seg_line = ' '.join(seg_list)
    return seg_line
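
# read_stopwords is not shown in these snippets. A minimal sketch of such a
# helper, assuming a UTF-8 file with one stopword per line (an assumed format):
def read_stopwords(path):
    stopwords = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:
                stopwords.add(word)
    return stopwords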
Example #5
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(),
                       cut_type='word')  # returns the result of jieba.lcut(sentence)
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line
Example #6
def preprocess_sentence(sentence):    # segment the sentence, drop filtered words, and re-join with spaces
    seg_list = segment(sentence.strip(), cut_type='word')     # segment the line into words
    seg_list = remove_words(seg_list)    # drop the tokens that should be filtered out
    seg_line = ' '.join(seg_list)
    return seg_line
Example #7
def segment_sentence(data):
    source = []
    for line in data:
        source.append(segment(line.strip(), cut_type='char'))  # character-level segmentation
    return source
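
# For comparison, a rough illustration of character-level versus word-level
# output, assuming cut_type='char' splits the string into single characters
# (an assumption; the segment() wrapper itself is not shown in these snippets):
import jieba

sample = '今天天气很好'
char_tokens = list(sample)          # char-level: ['今', '天', '天', '气', '很', '好']
word_tokens = jieba.lcut(sample)    # word-level: fewer, multi-character tokens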
Example #8
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')  # tokenize; returns a list
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line
Example #9
def indexing_by_sentence(book, sentence):
    sentence = remove_punctuation(sentence)
    if not sentence:
        return
    words = tokenizer.segment(sentence)
    indexing_by_words(book, words)
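
# indexing_by_words and indexing_search_by_word are not included in these
# snippets. A minimal in-memory sketch of the pair as a plain inverted index
# (word -> set of books), purely to illustrate the interface the two search
# examples assume; the real project presumably persists its index elsewhere.
_INDEX = {}

def indexing_by_words(book, words):
    # Register the book under every word produced by the tokenizer.
    for w in words:
        _INDEX.setdefault(w, set()).add(book)

def indexing_search_by_word(word):
    # Return the books indexed under a single word (empty list if unseen).
    return list(_INDEX.get(word, ()))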