def cut_input(input):
    '''
    Segment an input string with norm_seg and return a utf-8 encoded,
    space-joined token string.

    A token is dropped when:
      * its word or POS flag is blank after stripping,
      * it appears in the project stopword file, or
      * it is pure ASCII (utf-8 byte length equals character length).

    :param input: text to segment
    :return: utf-8 encoded str of the surviving tokens joined by spaces
    '''
    # Load stopwords into a set for O(1) membership tests.
    # BUGFIX: the original readline() loop stopped at the first blank
    # line, silently truncating the stopword list; iterating the file
    # object reads every non-blank line instead.
    with open(project_path + '/stopwords') as f:
        stop_words = set(line.strip() for line in f if line.strip())

    words_list = []
    for w in norm_seg(input):
        if w.word.strip() == '' or w.flag.strip() == '':
            continue  # empty token or empty POS flag
        token = w.word.encode('utf-8').strip()
        if token in stop_words:
            continue  # word to be dropped: stopword
        if len(token) == len(w.word.strip()):
            continue  # word to be dropped: pure-ASCII (bytes == chars)
        words_list.append(w.word)

    return " ".join(words_list).encode('utf-8')
def cut_seg(self, sentence=''):
    '''
    Segment *sentence* with the normalized jieba segmenter.

    :param sentence: text to segment (defaults to the empty string)
    :return: the norm_seg result for *sentence*
    '''
    return norm_seg(sentence)
def cut_input(process_data):
    '''
    Segment a string with jieba's normalized segmenter (norm_seg) and
    return the non-empty tokens joined by single spaces, utf-8 encoded.

    :param process_data: text to segment
    :return: utf-8 encoded str of tokens joined by spaces
    '''
    tokens = [seg.word for seg in norm_seg(process_data) if seg.word.strip()]
    return " ".join(tokens).encode('utf-8')
def cut_input(input):
    '''
    Segment *input* and return its tokens joined by spaces as a utf-8
    encoded string. Tokens whose word or POS flag is blank are skipped.

    :param input: text to segment
    :return: utf-8 encoded str of tokens joined by spaces
    '''
    kept = [
        seg.word
        for seg in norm_seg(input)
        if seg.word.strip() and seg.flag.strip()
    ]
    return " ".join(kept).encode('utf-8')
def cut_input(input, posFlag):
    '''
    Segment *input* and return a space-joined token string.

    :param input: text to segment
    :param posFlag: when truthy, each token is rendered as "word_flag"
                    using the POS-tagged segmenter (norm_seg); otherwise
                    the plain norm_cut tokens are joined.
    :return: string of tokens joined by single spaces (not encoded)
    '''
    # `posFlag == True` replaced with a plain truth test: idiomatic, and
    # also accepts truthy values such as 1. Dead commented-out code removed.
    if posFlag:
        tagged = [w.word + '_' + w.flag for w in norm_seg(input)]
        words = " ".join(tagged)
    else:
        words = " ".join(norm_cut(input))
    return words
def cut_input(input, flag=False):
    '''
    Segment *input*, dropping tokens with a blank word or blank POS flag.

    :param input: text to segment
    :param flag: when True, emit each token as "word_flag" instead of
                 just the word.
    :return: utf-8 encoded str of tokens joined by spaces
    '''
    pieces = []
    for seg in norm_seg(input):
        if not seg.word.strip() or not seg.flag.strip():
            continue
        pieces.append(seg.word + '_' + seg.flag if flag else seg.word)
    return " ".join(pieces).encode('utf8')
def count_words(self, lwords):
    """
    Count (word, POS-flag) frequencies over a list of texts.

    :param lwords: iterable of text strings to segment
    :return: dwords, a Counter (dict subtype); key is 'word#flag',
             value is that pair's occurrence frequency
    """
    jieba.enable_parallel(10)  # start worker processes for segmentation
    try:
        # hoist the loop-invariant length threshold out of the hot loop
        min_length = int(self.word_length)
        word_flags = []  # one 'word#flag' entry per occurrence
        for context in lwords:
            for sub in norm_seg(context):
                w = sub.word
                # drop general-purpose words and blank tokens
                if self.oGWF.isGeneralWord(w.encode('utf-8')) or w.strip() == '':
                    continue
                if len(w) >= min_length:
                    word_flags.append('%s#%s' % (w, sub.flag))
        logger.info('count is starting')
    finally:
        # BUGFIX: always tear the worker pool down, even when
        # segmentation raises — the original leaked it on error.
        jieba.disable_parallel()
    return Counter(word_flags)
def cut(self, sentence='', special_words=None, industrys=None):
    '''
    Segment *sentence*, optionally through a temporary user dictionary.

    :param sentence: text to segment
    :param special_words: extra dictionary entries, e.g. ['美丽 a', '转发 v'];
                          merged into a throw-away dictionary file
    :param industrys: industry dictionary ids — 2 car, 7 makeup, 0 new words
    :return: the norm_seg result for *sentence*
    '''
    # None-sentinels instead of mutable default arguments.
    special_words = special_words or []
    industrys = industrys or []
    f_special_dict = self.f_inner_dict
    if special_words and f_special_dict:
        # build a temporary dictionary file containing the extra words
        f_special_dict = self.__set_new_dic(special_words)
    if os.path.exists(f_special_dict):
        jieba.set_dictionary(f_special_dict)
    load_industrydict(industrys)
    # NOTE: removed a stray debug `print sentence` that polluted stdout.
    words = norm_seg(sentence)
    # clean up the temporary dictionary, but only when one was created
    if special_words and os.path.exists(
            f_special_dict) and self.f_inner_dict != f_special_dict:
        # os.remove instead of `os.system('rm -rf %s')`: no shell is
        # involved, so paths with spaces or metacharacters stay safe
        os.remove(f_special_dict)
    return words
def cut_input_plus(input, sBrand):
    '''
    Segment *input* and return several views of the token stream plus
    brand-detection information.

    :param input: text to segment
    :param sBrand: utf-8 brand name to look for among the tokens
    :return: (bIsBrand, words, words_posFlag, posDict, sBrand) —
             bIsBrand is True when the brand occurred; words joins the
             tokens with spaces; words_posFlag joins 'word_flag' pairs;
             posDict maps each POS flag to its token list plus an 'all'
             bucket; sBrand is the brand token after space normalization
             (spaces inside a token are replaced with the '#kong#'
             sentinel so the later " ".join cannot split it).
    '''
    result = norm_seg(input)
    wordsList = []
    wordsPosList = []
    posDict = {'all': []}
    bIsBrand = False
    sBrand0 = ""
    for w in result:
        if w.word.strip() == '' or w.flag.strip() == '':
            continue
        w.word = w.word.strip().encode('utf8')
        w.flag = w.flag.strip().encode('utf8')
        # brand match is tested before the space-to-sentinel rewrite
        bMatchesBrand = sBrand == w.word
        if bMatchesBrand:
            bIsBrand = True
        if len(w.word.split(' ')) > 1:
            w.word = "#kong#".join(w.word.split(' '))
        # BUGFIX: the original did `if bIsBrand: sBrand0 = w.word`, which
        # re-assigned sBrand0 on EVERY token after the brand was first
        # seen, so the returned sBrand ended up being the last token of
        # the text. Capture only the brand token's normalized form.
        if bMatchesBrand:
            sBrand0 = w.word
        wordsList.append(w.word)
        wordsPosList.append(w.word + '_' + w.flag)
        if not w.flag in posDict:
            posDict[w.flag] = []
        posDict[w.flag].append(w.word)
        posDict['all'].append(w.word)
    words_posFlag = " ".join(wordsPosList)
    words = " ".join(wordsList)
    if sBrand0 != "":
        sBrand = sBrand0
    return bIsBrand, words, words_posFlag, posDict, sBrand
def cut_input(input):
    '''
    Segment *input* and return three views of the result:
      * a utf-8 str of the tokens joined by spaces,
      * a utf-8 str of 'word_flag' pairs joined by spaces,
      * a dict mapping each POS flag to its utf-8 encoded tokens, plus
        an 'all' bucket holding every token in order.

    Tokens with a blank word or blank POS flag are skipped.
    '''
    plain = []
    tagged = []
    by_flag = {'all': []}
    for seg in norm_seg(input):
        word, flag = seg.word, seg.flag
        if not word.strip() or not flag.strip():
            continue
        plain.append(word)
        tagged.append(word + '_' + flag)
        by_flag.setdefault(flag, []).append(word.encode('utf-8'))
        by_flag['all'].append(word.encode('utf-8'))
    return (" ".join(plain).encode('utf-8'),
            " ".join(tagged).encode('utf-8'),
            by_flag)
#!/usr/bin/env python # -*- coding: utf-8 -*- """ 1.test suggest_freq 2.test load industry_dict 3.test special word (If the word contains spaces, add to userword / specialword ) industry_dict = {2:"car_dict",7:"makeup.dict"} """ from jieba import suggest_freq from jieba.norm import norm_seg, load_industrydict #print suggest_freq('小黑瓶',True) #test 2 testword = ['长安欧尚', "睿骋cc", "行动力", "蓝水粉水小黑瓶"] load_industrydict([2, 7]) for i in testword: for word in norm_seg(i): print word.word, word.flag