def save_data(data_1, data_2, data_3, data_path_1, data_path_2, data_path_3, stop_words_path=''):
    stopwords = read_stopwords(stop_words_path)
    with open(data_path_1, 'w', encoding='utf-8') as f1:
        count_1 = 0
        for line in data_1:
            # print(line)
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # seg_words = []
                # for j in seg_list:
                #     if j in stopwords:
                #         continue
                #     seg_words.append(j)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f1.write('%s' % seg_line)
                    f1.write('\n')
                    count_1 += 1
        print('train_x_length is ', count_1)
    with open(data_path_2, 'w', encoding='utf-8') as f2:
        count_2 = 0
        for line in data_2:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                # seg_words = []
                # for j in seg_list:
                #     if j in stopwords:
                #         continue
                #     seg_words.append(j)
                # if len(seg_list) > 0:
                seg_line = ' '.join(seg_list)
                f2.write('%s' % seg_line)
                f2.write('\n')
                count_2 += 1
        print('train_y_length is ', count_2)
    with open(data_path_3, 'w', encoding='utf-8') as f3:
        count_3 = 0
        for line in data_3:
            if isinstance(line, str):
                seg_list = segment(line.strip(), cut_type='word')
                seg_list = remove_words(seg_list)
                if len(seg_list) > 0:
                    seg_line = ' '.join(seg_list)
                    f3.write('%s' % seg_line)
                    f3.write('\n')
                    count_3 += 1
        print('test_y_length is ', count_3)

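# Minimal sketch of the helpers that save_data() and preprocess_sentence() assume.
# The stopword-file format (one word per line) and the optional remove list are
# assumptions made for illustration, not confirmed by the snippets in this section.
def read_stopwords(path):
    stopwords = set()
    if path:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    stopwords.add(word)
    return stopwords

def remove_words(words, remove_list=None):
    # Drop unwanted tokens; the concrete remove list is project-specific.
    remove_set = set(remove_list) if remove_list else set()
    return [w for w in words if w not in remove_set]
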
def indexing_search(text):
    key_word = remove_punctuation(text)
    if not key_word:
        return []
    rlist = []
    # If the query is an ISBN (10 or 13 digits), search by ISBN only.
    c = re.match(r"(\d{13}|\d{10})", key_word.strip())
    if c:
        isbn = c.group(0)
        rlist += indexing_search_by_word(isbn)
        return rlist
    # Otherwise, search each whitespace-separated term plus the full query string.
    words = key_word.split()
    words.append(key_word)
    for w in words:
        rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    if rlist:
        return rlist
    # If whitespace splitting finds nothing, fall back to semantic word segmentation.
    words = tokenizer.segment(key_word)
    for w in words:
        if isinstance(w, str):  # the Python 2 original also checked `unicode`
            rlist += indexing_search_by_word(w)
    rlist = list(set(rlist))
    return rlist

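# indexing_search() relies on remove_punctuation(), which is not shown here. A
# minimal sketch, assuming it replaces ASCII and common CJK punctuation with
# spaces; the exact character set is an assumption.
import re
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        return ''
    cjk_punct = '，。、！？；：“”‘’（）《》【】'
    table = {ord(ch): ' ' for ch in string.punctuation + cjk_punct}
    return text.translate(table).strip()
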
def preprocess_sentence(sentence):
    # segment(sentence, cut_type='word', pos=False) calls jieba.lcut(sentence),
    # which already returns a list.
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

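# Sketch of the segment() wrapper described in the comment above: cut_type='word'
# delegates to jieba.lcut. The cut_type='char' branch (used by segment_sentence
# below) and the unused pos flag are assumptions for illustration.
import jieba

def segment(sentence, cut_type='word', pos=False):
    if cut_type == 'word':
        return jieba.lcut(sentence)
    # Assumed behaviour for char-level segmentation: one token per character.
    return list(sentence)
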
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')
    seg_list = remove_words(seg_list, REMOVE_WORDS)
    # filter stopwords
    stopwords_path = '{}/datasets/stopwords.txt'.format(BASE_DIR)
    stopwords = read_stopwords(stopwords_path)
    seg_list = remove_words(seg_list, stopwords)
    seg_line = ' '.join(seg_list)
    return seg_line

def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')  # returns the result of jieba.lcut(sentence)
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

def preprocess_sentence(sentence):
    # Segment the sentence, drop the words in the remove list, and rejoin the
    # remaining tokens with spaces to form a new sentence.
    seg_list = segment(sentence.strip(), cut_type='word')  # word-segment one line of data
    seg_list = remove_words(seg_list)                      # drop the tokens that should be removed
    seg_line = ' '.join(seg_list)
    return seg_line

def segment_sentence(data):
    source = []
    for line in data:
        source.append(segment(line.strip(), cut_type='char'))
    return source

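# Example use of segment_sentence() on hypothetical input lines; under the
# assumed segment() sketch above, cut_type='char' yields one token per character.
lines = ['今天天气很好', '我们去公园散步']
char_tokens = segment_sentence(lines)
# e.g. char_tokens[0] == ['今', '天', '天', '气', '很', '好']
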
def preprocess_sentence(sentence):
    seg_list = segment(sentence.strip(), cut_type='word')  # tokenize; returns a list
    seg_list = remove_words(seg_list)
    seg_line = ' '.join(seg_list)
    return seg_line

def indexing_by_sentence(book, sentence):
    sentence = remove_punctuation(sentence)
    if not sentence:
        return
    words = tokenizer.segment(sentence)
    indexing_by_words(book, words)

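# indexing_by_sentence() and indexing_search() both assume indexing_by_words()
# and indexing_search_by_word() on top of some inverted index. A minimal
# in-memory sketch, assuming the index maps a word to the set of books that
# contain it; the real storage backend is not shown in these snippets.
from collections import defaultdict

_index = defaultdict(set)

def indexing_by_words(book, words):
    for w in words:
        if w:
            _index[w].add(book)

def indexing_search_by_word(word):
    return list(_index.get(word, set()))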