def worker(item):
    size = 0
    ids, masked_ids = [], []
    index = item[1]
    item = item[0]
    min_index = item.offsets[1][0]  # get original index of first word in encoded segment
    max_index = max(item.offsets)[1]  # get original index of last word in encoded segment
    words = list(jieba.tokenize(data[index][min_index:max_index]))
    arr = np.array(item.ids, dtype=np.int32)
    if np.count_nonzero(arr) > 10:
        masked_id = mask_ids(item, words)
        if masked_id is not None:
            ids.append(arr)
            masked_ids.append(np.array(masked_id, dtype=np.int32))
            size += 1
    for overflowing in item.overflowing:
        min_index = overflowing.offsets[1][0]
        max_index = max(overflowing.offsets)[1]
        words = list(jieba.tokenize(data[index][min_index:max_index]))
        arr = np.array(overflowing.ids, dtype=np.int32)
        if np.count_nonzero(arr) > 10:
            masked_id = mask_ids(overflowing, words)
            if masked_id is not None:
                ids.append(arr)
                masked_ids.append(np.array(masked_id, dtype=np.int32))
                size += 1
    return ids, masked_ids, size
def word_process(text):
    """
    :param text:
    :return:
    """
    result = []
    print(jieba.tokenize(text))
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    # result = word_process('我明天去吃饭')
    print(result)
    raw_entities = []
    for (item_posseg, start, end) in result:
        part_of_speech = ["nr", "ns", "nt", "t"]
        for (word_posseg, flag_posseg) in item_posseg:
            print(word_posseg)
            print(flag_posseg)
            if flag_posseg in part_of_speech:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word_posseg,
                    'entity': flag_posseg
                })
    print(raw_entities)
def getTest_feature(test_data):
    paragraphs = []
    questions = []
    test_ids = []
    # store each data row temporarily
    tmp_x_row = []  # get data position
    subjects = test_data['data']
    for subject in subjects:  # subject contains title and *paragraphs*
        for paragraph in subject['paragraphs']:  # paragraphs contains *context* and *qas*
            context = list(jieba.tokenize(paragraph['context'].replace("\n", "")))
            for qa in paragraph['qas']:
                ######################################
                paragraphs.append(context)
                questions.append(list(jieba.tokenize(qa['question'])))
                #######################################
                test_ids.append(qa['id'])  # append question:string to tmp_x_row (behind context:string)
    # check that every question has a unique answer
    return paragraphs, questions, test_ids
def analyse_fenci():
    # 2.4 Tokenization analysis: beyond plain segmentation we also want to analyse the text,
    # e.g. return the position of each word or extract keywords.
    # 2.4.1 Return the position of each word
    import jieba.analyse
    print("1.采取精准模式结果:")
    # print([item for item in jieba.tokenize(u"数据分析与数据挖掘的应用")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用"):
        print item[0], item[1], item[2]
    print("-------------------")
    print("2.采取搜索模式结果:")
    # print([item for item in jieba.tokenize("数据分析与数据挖掘的应用", mode="search")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用", mode="search"):
        print item[0], item[1], item[2]

    # 2.4.2 Extract keywords from the text
    print '提取文本中的关键词:'  # results combine in-text term frequency with dictionary frequency for ranking
    import jieba.analyse
    # print(jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3))
    for item in jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3):
        print item + ' ',
    print ''
def fun5():
    print("默认的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用")
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))
    print("\n------------分割线-------------\n")

    print("搜索模式的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用", mode='search')
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))
def _get_same_words_with_cut(self, source: str, target: str):
    """Extract the words shared by both strings, using jieba for segmentation."""
    res_words: [Word] = []
    source_cut = [Word(*word) for word in jieba.tokenize(source)]
    target_cut = [Word(*word) for word in jieba.tokenize(target)]
    for word in source_cut:
        if word in target_cut and word.text not in STOPWORDS and len(word.text) >= self.least_word_len:
            res_words.append(word)
    return res_words
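# The Word class is not shown in the snippet above; a minimal sketch, assuming it only
# wraps the (word, start, end) tuples yielded by jieba.tokenize. The real class may well
# compare on text alone rather than on positions, so treat this namedtuple as illustrative.
from collections import namedtuple

Word = namedtuple("Word", ["text", "start", "end"])

# Example: Word(*("自然语言", 0, 4)) -> Word(text='自然语言', start=0, end=4)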
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import jieba

    text = message.get(attribute)
    if self.component_config.get("case_sensitive", False):
        tokenized = jieba.tokenize(text)
    else:
        # not case sensitive: normalise to lower case before segmenting
        tokenized = jieba.tokenize(text.lower())
    tokens = [Token(word, start) for (word, start, end) in tokenized]

    return self._apply_token_pattern(tokens)
def tokenize():
    """
    Tokenization demo: log each word with its start/end offsets.
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"

    result = jieba.tokenize(s)
    logger.info("普通模式")
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))

    logger.info("\n搜索模式")
    result = jieba.tokenize(s, mode='search')
    for tk in result:
        logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))
def jieba_fenci_for_crawl_doc(doc):
    """Segment a crawled document with jieba."""
    for lib in JIEBA_CUSTOM_LIBS:
        prodict = os.path.join(settings.STATICFILES_DIRS[0], 'jiebadic', lib[0])
        try:
            jieba.load_userdict(prodict)
        except IOError:
            continue
    rs = ['\xa0', '一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、']
    for r in rs:
        doc = doc.replace(r, '')
    regex = re.compile(r'[\n\r\t,.:\-";()。、:,的<>》《()]')  # strip newlines, tabs and Chinese punctuation
    t = regex.sub("", doc)
    fenci_data = jieba.tokenize(t)  # jieba tokenization
    return fenci_data
def getChList(docStrByte):
    # takes a document's raw bytes and returns the Chinese word-segmentation result
    # decode the bytes as GBK and lower-case any English letters
    inputStr = str(docStrByte, encoding='gbk', errors='ignore').lower()
    # drop newlines so every line joins into one paragraph
    strList = ''.join(inputStr.split('\n'))
    rawTokens = list(jieba.tokenize(strList))  # Chinese word segmentation
    # stopWord is a dict whose keys are stop words and whose values are all None
    fSW = open('stopwords.txt', 'r', encoding='utf-8', errors='ignore').read()
    stopWord = {}.fromkeys(fSW.split('\n'))
    stopWord[''] = None
    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        # print(seg[0].strip())
        rawWord = seg[0].strip()  # strip() removes surrounding whitespace
        if rawWord.isalpha():  # stem English words
            word = s.stem(rawWord)
        else:
            word = rawWord
        if word not in stopWord:  # drop stop words
            final.append(word)
    return final  # return the list of kept tokens
def high_freq_words():
    sentence = '我喜欢苏州的苏州中心,上海,上海的东方明珠'
    words = jieba.analyse.extract_tags(sentence, topK=3)
    print(f'top 3 的词语 {words}')

    # return the position of each word
    words_loc = jieba.tokenize(sentence)
    print(f'各个词语的位置{list(words_loc)}')
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def __call__(self, text, **kargs):
    token = Token()
    words = set()
    words_list = []
    for (i, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
        i = i.strip()
        if not i:
            continue
        if i in words:
            continue
        if i in punct:
            continue
        words.add(i)
        words_list.append((i, start_pos, stop_pos))
    # iterate over the collected tuples so each token keeps its own offsets
    for (w, start_pos, stop_pos) in words_list:
        if not accepted_chars.match(w):
            if len(w) <= 1:
                continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def entity_rec(request):
    req = request.body
    print(req)
    entity_d = {
        'person': [],
        'fund': [],
        'company': [],
        'industry': [],
        'stock': []
    }
    index_l = [0 for i in range(len(news))]
    result = jieba.tokenize(news)
    start = time.time()
    for k in result:
        if k[0] in person_list:
            entity_d['person'] = entity_d['person'] + [k[0]]
            index_l[k[1]:k[2]] = [1 for _ in range(k[2] - k[1])]
        if k[0] in fund_list:
            entity_d['fund'] = entity_d['fund'] + [k[0]]
            index_l[k[1]:k[2]] = [2 for _ in range(k[2] - k[1])]
        if k[0] in company_list:
            entity_d['company'] = entity_d['company'] + [k[0]]
            index_l[k[1]:k[2]] = [3 for _ in range(k[2] - k[1])]
        if k[0] in industry_list:
            entity_d['industry'] = entity_d['industry'] + [k[0]]
            index_l[k[1]:k[2]] = [4 for _ in range(k[2] - k[1])]
        if k[0] in stock_list:
            entity_d['stock'] = entity_d['stock'] + [k[0]]
            index_l[k[1]:k[2]] = [5 for _ in range(k[2] - k[1])]
    print("--- %s seconds ---" % (time.time() - start))
    print(json.dumps({'entity_d': entity_d, 'index_l': index_l}))
    return json.dumps({'entity_d': entity_d, 'index_l': index_l})
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    result = jieba.tokenize(cas.sofa_string)
    for tk in result:
        prediction = self.create_prediction(cas, layer, feature, tk[1], tk[2], tk[0])
        cas.add_annotation(prediction)
def summarize(text, cut_search, window=100):
    content = get_content(doc.get('path'))
    tokres = jieba.tokenize(content, mode='search')
    search_words = {}
    for i in range(len(cut_search)):
        search_words[cut_search[i]] = i
    kaps = []
    for x in tokres:
        if x[0] in search_words.keys():
            kaps.append((x[1], x[2], search_words[x[0]]))
    kaps.sort(key=(lambda x: x[0]))
    nextitem = 0
    maxv, s, e = 0, 0, 0
    for i in range(len(kaps)):
        end = kaps[i][0] + window
        while nextitem < len(kaps) and kaps[nextitem][1] <= end:
            nextitem += 1
        exc, rni = nextitem - i, nextitem
        while rni < len(kaps) and kaps[rni][0] < end:
            if kaps[rni][1] <= end:
                exc += 1
            rni += 1
        if exc > maxv:
            maxv, s, e = exc, i, rni
    lens = kaps[s][0]
    kaps = kaps[s:e]
    maxk = max((x[1] for x in kaps if x[1] <= lens + window))
    lens -= (lens + window - maxk) // 2  # integer division so lens stays a valid slice index
    if lens + window > len(content):
        lens = len(content) - window
    if lens < 0:
        lens = 0
    return maxv, lens, len(content), content[lens:lens + window], kaps
def _load_cedict(filename, hsk=None):
    cedict = defaultdict(list)
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue
            tr, sm, py, transl = re.match(r"(\S*) (\S*) \[(.*)\] \/(.*)\/", line).groups()
            transl = transl.split('/')
            transl = '/'.join([
                t for t in transl
                if not t.startswith('see also ') and not t.startswith('variant of')
            ])
            cedict[sm].append((tr, py, transl))

    # Find compounds with jieba
    num = 0
    compound_parts = {}
    for sm, entries in cedict.items():
        # search mode will produce compounds and their parts
        tokens = list(jieba.tokenize(sm, mode='search'))
        parts = [t for t in tokens if t[2] - t[1] < len(sm)]
        compound_parts[sm] = parts

    # Join multiple sound characters (多音字)
    cedict = {
        sm: (sm, entries, compound_parts[sm])
        for sm, entries in cedict.items()
    }
    return cedict
def build_word_level_vocabulary_all(train_file, valid_file=None, test_file=None):
    sentences = list()
    with codecs.open(train_file, encoding='utf-8') as f_train:
        for line in f_train:
            x = json.loads(line)
            sentences.extend([x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if valid_file:
        with codecs.open(valid_file, encoding='utf-8') as f_valid:
            for line in f_valid:
                x = json.loads(line)
                sentences.extend([x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if test_file:
        with codecs.open(test_file, encoding='utf-8') as f_test:
            for line in f_test:
                x = json.loads(line)
                sentences.extend([x['A'].strip(), x['B'].strip(), x['C'].strip()])
    corpus = u''.join(sentences)
    word_list = list(set([tk[0] for tk in jieba.tokenize(corpus)]))
    return dict((word, idx + 1) for idx, word in enumerate(word_list))
def tokenize(sentence, mode='default'):
    """
    Tokenize and return each word's position.
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    return list(jieba.tokenize(sentence, mode=mode))
def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
    import jieba

    text = self.preprocess_text(text, attribute)
    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(sentence):
    """
    Tokenize in search mode.
    :param sentence:
    :return: list of (word, start_index, end_index), mode='search'
    """
    return list(jieba.tokenize(sentence, mode='search'))
def split(self, input_s):
    self.s = input_s
    self.token = jieba.tokenize(self.s)
    num_en = 0
    num_zh = 0
    for t in self.token:
        if not t[0].isspace():
            if t[0] in ',,"\'‘’“”#@%<>《》{}【】[]。,!!??':
                self.symbol.append(t)
            else:
                lang = langid.classify(t[0])[0]
                if lang == "en":
                    self.english.append(t)
                    num_en += 1
                elif lang == "zh":
                    self.chinese.append(t)
                    num_zh += 1
                else:
                    self.other.append(t)
    if num_en == 1 and num_zh == 1:
        code_mix = 1
    if num_en == 0 and num_zh == 0:
        self.note = "other"
    elif num_en > num_zh:
        self.note = "en"
        self.translate_en_zh()
    else:
        self.note = "zh"
        self.translate_zh_en()
def correct(ss):
    '''
    Correct sentence ss
    '''
    # jieba.tokenize returns tuples of (word, st, en); mode='search' is also available
    tokens = list(jieba.tokenize(ss))
    print('Segmented sentence is {}'.format(''.join([str(token) for token in tokens])))
    segranges = [[token[1], token[2]] for token in tokens]
    _, _, outranges = score_sentence(ss)
    if outranges:
        cranges = merge_ranges(get_overlap_ranges(outranges, segranges))
        for crange in cranges:
            print('Correct range is {}'.format(crange))
            st, en = crange
            print('Possible wrong segment is {}'.format(ss[st:en]))
            pwrong = ss[st:en]
            # seg_list = jieba.cut(pwrong)
            # error_string = ", ".join(seg_list)
            # errors = error_string.split(", ")
            # cgram = ""
            # for error in errors:
            cgram = auto_correct(pwrong, cn_dict, word_freq)
            ss = ss[:st] + cgram + ss[en:]
            print('Corrected pinyin is {}'.format(cgram))
            cgram2 = correct_ngram_2(ss, st, en)
            print('Corrected ngram is {}'.format(cgram2))
            ss = ss[:st] + cgram2 + ss[en:]
    else:
        cranges = []
        print('No segment to correct.')
    return ss, cranges
def cuttest(test_sent):
    global g_mode
    for n in re_num.finditer(test_sent):
        print(n.start(), n.end(), n.group())
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
def get_words_jd(input_file):
    max_length_words = 100
    length = 0
    jd_str = ""
    jd_position = {}
    jd_words = []
    with open(input_file) as f:
        lines = f.readlines()
        for line in lines:
            if not line:
                continue
            line = line.strip()
            line = re.sub(r"\s+", "", line)
            line = line.decode('utf-8')
            for word in line:
                jd_str += word.encode('utf-8')
                length += 1
            if length >= max_length_words:
                break
    result = jieba.tokenize(jd_str.decode('utf-8'))
    for tk in result:
        if tk[0].encode('utf-8') not in stop_words and tk[0].encode('utf-8') not in stop:
            jd_words.append(tk[0].encode('utf-8'))
            if not jd_position.has_key(tk[0].encode('utf-8')):
                jd_position[tk[0].encode('utf-8')] = {"start_pos": tk[1], "end_pos": tk[2]}
    return jd_str, jd_words, jd_position
def test_tokenize():
    """
    Test jieba.tokenize.
    :return:
    """
    # token stream in default mode
    result = jieba.tokenize("永和服装饰品有限公司")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(tk[0], tk[1], tk[2]))

    # token stream in search mode
    result = jieba.tokenize("永和服装饰品有限公司", mode="search")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(tk[0], tk[1], tk[2]))
def jieba_split(
        self, i: int,
        normalized_string: NormalizedString) -> List[NormalizedString]:
    splits = []
    for token, start, stop in jieba.tokenize(str(normalized_string)):
        splits.append(normalized_string[start:stop])
    return splits
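# The jieba_split method above follows the custom pre-tokenizer pattern of the Hugging Face
# `tokenizers` library; a minimal sketch of how such a method is typically wired up (the
# surrounding class and the BPE model are assumptions, not part of the original snippet):
import jieba
from typing import List
from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer


class JiebaPreTokenizer:
    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        # delegate splitting to jieba, keeping the offsets it reports
        return [normalized_string[start:stop]
                for _, start, stop in jieba.tokenize(str(normalized_string))]

    def pre_tokenize(self, pretok: PreTokenizedString):
        # apply jieba_split to every string held by the PreTokenizedString
        pretok.split(self.jieba_split)


tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())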
def test_tfidf():
    lines = open('D:\\Python\\Data\\NBA.txt', encoding='utf-8').read()
    print(type(lines))

    # keyword extraction with TF-IDF
    words = analyse.extract_tags(lines, topK=20, withWeight=True, allowPOS=())
    print(words)

    # keyword extraction with TextRank
    words = analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    print(words)
    words = analyse.textrank(lines, topK=20, withWeight=False, allowPOS=('ns', 'n'))
    print(words)

    # part-of-speech tagging
    words = pseg.cut('我爱自然语言处理')
    # print(list(words))
    for word, flag in words:
        print(word, flag)

    # tokenize: return each word's start/end position in the original text
    result = jieba.tokenize('我爱自然语言处理')
    print(list(result))
def ann_rebuild(filename):
    jieba.load_userdict("jiebadic.txt")
    rf = codecs.open(filename, encoding='utf-8')
    annotation = {}
    for line in rf:
        if line.startswith("T"):
            word = line.strip().split('\t')[-1]
            type_offset = line.strip().split('\t')[1].split(' ')
            type = type_offset[0]
            start = int(type_offset[1])
            final_end = int(type_offset[-1])
            result = jieba.tokenize(word)
            for i, tk in enumerate(result):
                end = start + tk[2] - tk[1]
                if i == 0:
                    type0 = "B-" + type
                    annotation[(start, end)] = type0
                elif end == final_end:
                    type1 = "E-" + type
                    annotation[(start, end)] = type1
                else:
                    type2 = "I-" + type
                    annotation[(start, end)] = type2
                start = end
    rf.close()
    return annotation
def tokenizer(filename):
    word_counter = collections.defaultdict(int)
    with open(filename) as f:
        for line in f:
            for word in jieba.tokenize(line.decode('utf-8')):
                word_counter[word[0]] += 1
    return word_counter
def feature_embeding(comment):
    size = 15
    par = 1
    data = pd.read_excel('lstm_data/feature_word.xlsx', index=None)
    definite_words = list(data['肯定词'])
    positive_words = list(data['正向'])
    negative_words = list(data['负向'])
    imagine_words = list(data['假想词'])
    deny_words = list(data['否定词'])
    inter_words = list(data['疑问词'])
    assume_words = list(data['假定词'])
    # seven feature groups, `size` columns each (a single group of 15 would leave
    # every slice beyond 0:size out of bounds and silently ignored)
    feature_embed = np.zeros((len(comment), maxlen_context, 7 * size))
    for i, t in enumerate(comment):
        token = jieba.tokenize(t[:maxlen_context])
        for tk in token:
            if tk[0] in deny_words:
                feature_embed[i, tk[1]:tk[2], 0:size] = par
            if tk[0] in inter_words:
                feature_embed[i, tk[1]:tk[2], size:2 * size] = par
            if tk[0] in assume_words:
                feature_embed[i, tk[1]:tk[2], 2 * size:3 * size] = par
            if tk[0] in definite_words:
                feature_embed[i, tk[1]:tk[2], 3 * size:4 * size] = par
            if tk[0] in positive_words:
                feature_embed[i, tk[1]:tk[2], 4 * size:5 * size] = par
            if tk[0] in negative_words:
                feature_embed[i, tk[1]:tk[2], 5 * size:6 * size] = par
            if tk[0] in imagine_words:
                feature_embed[i, tk[1]:tk[2], 6 * size:7 * size] = par
    return feature_embed
def getChList(docStrByte):
    inputStr = str(docStrByte, encoding='gbk', errors='ignore')
    # filter out the first several sentences
    strList = list(i for i in inputStr.split('\n'))
    # print(strList)
    startLine = 0
    for i in range(len(strList)):
        if strList[i].startswith('【'):
            startLine += 1
        else:
            break
    # print(strList[startLine:])
    rawTokens = list(jieba.tokenize(''.join(strList[startLine:])))
    # stopWord = {}.fromkeys([line for line in open('stopwords.txt','r',encoding = 'gbk', errors = 'ignore')])
    fSW = open('stopwords.txt', 'r', encoding='utf-8', errors='ignore').read()
    # print(fSW.split('\n')[:99])
    stopWord = {}.fromkeys(fSW.split('\n'))
    # print(stopWord)
    stopWord[''] = None
    # for (k,v) in stopWord.items():
    #     print(k, ',', v)
    final = ''
    for seg in rawTokens:
        # print(seg)
        # seg.encode('gbk')
        word = seg[0].strip()
        if word not in stopWord:
            final += (' ' + word)  # if using final is not good
        # else:
        #     print(seg)
    # print(type(final))
    return final
def tokenize(sentence, addwords=None):
    if addwords is not None:
        for word in addwords:
            jieba.add_word(word)
    tokens = []
    for term in jieba.tokenize(sentence):
        tokens.append(term[0])
    return tokens
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
def segment(raw_text):
    tokens = jieba.tokenize(raw_text)
    seg_list = [w for (w, start_pos, stop_pos) in tokens if token_condition(w)]
    seg_freq_counter = Counter(seg_list)
    seg_freq = dict(seg_freq_counter)
    return json.dumps(seg_freq)
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content.decode('utf-8'))
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print >> sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
    print >> sys.stderr, "testTokenize"
def tokenize(sentence, mode='default'):
    """
    Tokenize and return each word's position.
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    return list(jieba.tokenize(sentence, mode=mode))
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    if self.dictionary_path is not None:
        self.load_custom_dictionary(self.dictionary_path)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def how_to_use():
    """The input string can be unicode, a UTF-8 string or a GBK string.
    Note: passing GBK strings directly is not recommended, as they may be
    unpredictably mis-decoded as UTF-8.
    jieba.cut and jieba.cut_for_search both return an iterable generator;
    iterate over it with a for loop to get each word (unicode), or use
    jieba.lcut / jieba.lcut_for_search to get a list directly."""
    dict_path = 'user_dict/user_dict.txt'

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

    jieba.load_userdict(dict_path)
    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    print("Full Mode: " + "/ ".join(seg_list))  # full mode

    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

    seg_list = jieba.lcut("他来到了网易杭研大厦")  # accurate mode by default
    print(", ".join(seg_list))
    print(type(seg_list))
    print(seg_list)

    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
    print(", ".join(seg_list))

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

    words = pseg.cut("我爱北京天安门")
    print(words)
    for word, flag in words:
        print('%s %s' % (word, flag))

    print('分词:默认模式')
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

    print('分词:搜索模式')
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
def test5():
    # default mode
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' % (tk[0], tk[1], tk[2]))
    print()
    print()
    # search mode
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' % (tk[0], tk[1], tk[2]))

# Feature 7: ChineseAnalyzer for the whoosh search engine
# https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py

# Other dictionaries
# Smaller-memory dictionary file: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
# Dictionary with better traditional Chinese support: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
# jieba.set_dictionary('data/dict.txt.big')
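# For reference, jieba's README lists the offsets produced for this sentence with the
# stock dictionary (results can differ once a custom dictionary is loaded):
#
# default mode: 永和 [0,2)  服装 [2,4)  饰品 [4,6)  有限公司 [6,10)
# search mode additionally yields the sub-words of the compound: 有限 [6,8)  公司 [8,10)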
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
            rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
            tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
            mappingFile = argv[i + 1]
        elif argv[i] == "-offset" and i + 1 < argc:
            offsetFile = argv[i + 1]

    with codecs.open(mappingFile, encoding='utf-8', mode='r') as input:
        for line in input:
            elements = line.strip().split(',')
            # mapping file lines are "token,word"; map the original word to its token id
            mapping[elements[1]] = elements[0]

    outputA = codecs.open(offsetFile, encoding='utf-8', mode='w')
    outputB = codecs.open(tokenizedFile, encoding='utf-8', mode='w')
    for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
        result = jieba.tokenize(line.strip())
        offsets = []
        newline = []
        for (word, begin, end) in result:
            tk = word
            if tk == ' ':
                newline.append(' ')
                continue
            if tk in punctuations:
                newline.append(tk)
                continue
            tk = ''.join([i for i in tk if not i.isdigit()]).lower()
            if len(tk) == 0:
                newline.append(' ')
                continue
            if tk not in mapping:
                newline.append('zzzzzzzzzzz')
            else:
                newline.append(mapping[tk])
            offsets.append((tk, begin, end))
            newline.append(' ')
        outputA.write(u''.join(newline))
        outputA.write('\n')
        for (string, begin, end) in offsets:
            outputB.write(string)
            outputB.write(',')
            outputB.write(unicode(begin))
            outputB.write(',')
            outputB.write(unicode(end))
            outputB.write('\t')
        outputB.write('\n')
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
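# An analyzer whose __call__ yields Token objects like the one above is meant to be plugged
# into a Whoosh schema. A minimal sketch using the ChineseAnalyzer bundled with jieba (the
# field names, the sample document and the "indexdir" directory are illustrative):
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

schema = Schema(path=ID(stored=True),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))

os.makedirs("indexdir", exist_ok=True)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(path=u"/doc1", content=u"永和服装饰品有限公司成立于上海")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", schema=ix.schema).parse(u"服装")
    for hit in searcher.search(query):
        print(hit["path"])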
def segment_text_desc(word):
    """
    Segment a text description.
    :param word:
    :return:
    """
    # materialise the generator first: iterating it below would otherwise exhaust it
    key_words = list(jieba.tokenize(word))
    for item in key_words:
        print item
    return key_words
def dealOnePage(control_obj, dataObj):
    title = dataObj[1].lower()
    cur_page_dic = PageDic()
    seg_list = jieba.tokenize(title, mode="search")  # get segment list
    for tk in seg_list:
        # handle one word of the page
        control_obj.addWordIdDic(tk[0])  # add the word to the word->wordId dictionary
        cur_page_dic.addPageItem(tk[0], tk[1], tk[2])  # record the word hits for the current page
    control_obj.addPageDic(dataObj[0], cur_page_dic.page_dic)  # store the page's word-hits dictionary
def segment_hanzi(txt):
    """
    Tokenizes Chinese text

    Args:
        txt -- Chinese text with Chinese characters in it (unicode)

    Returns:
        list of unicode, in which each element is a token of txt
    """
    tokens = jieba.tokenize(txt)
    tokens_hanzi = [tkn[0] for tkn in tokens]
    return tokens_hanzi
def tokenize(name, stopwords):
    # tokenize a Chinese sentence and remove the stopwords
    try:
        original_tokens = jieba.tokenize(name)
    except ValueError:
        print(name, 'is not unicode')
        return
    tokens = []
    for term in original_tokens:
        if term[0] not in stopwords:
            tokens.append(term[0])
    return tokens
def mapper_getterm(self, key, comment_list):
    for comment in comment_list:
        try:
            status_text = comment['status']['text']
            filtered_status_text = re.sub(r'[^A-Za-z\s]+', '', status_text).lower()
            comment_text = comment['text']
            if re.match('michael kors', filtered_status_text):
                for word in jb.tokenize(unicode(comment_text)):
                    # keep Chinese terms only and remove the stopwords
                    cond1 = re.match(ur'[\u4e00-\u9fff]+', word[0])
                    cond2 = word[0] not in stop_words
                    if cond1 and cond2:
                        yield ('michael', word[0]), 1
            if re.match('kate spade', filtered_status_text):
                for word in jb.tokenize(unicode(comment_text)):
                    # keep Chinese terms only and remove the stopwords
                    cond1 = re.match(ur'[\u4e00-\u9fff]+', word[0])
                    cond2 = word[0] not in stop_words
                    if cond1 and cond2:
                        yield ('kate', word[0]), 1
        except:
            pass
def mixed_lang_word_count(string):
    """
    Returns the word count of a string containing English and Chinese words.

    The string is split into English and Chinese, then returns the sum of the
    word counts from both substrings based on NLTK and Jieba.

    E.g. '你好 Andrew' returns 2, as '你好' is one word and 'Andrew' is another.

    :param string: a string containing english and chinese
    :returns: the word count
    """
    english_only = re.sub(r'\W+', '', string)
    num_eng_words = len(nltk.word_tokenize(english_only))

    non_english_only = re.sub(r'\w+', '', string)
    num_non_eng_words = len(list(jieba.tokenize(non_english_only.decode('utf-8'))))

    return num_eng_words + num_non_eng_words
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
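# The dispatcher above expects a JSON array of [command, args, kwargs]; the transport around
# it is not shown in the snippet, so this example simply calls handle() directly to illustrate
# the wire format (the sample sentence is arbitrary):
import json

request = json.dumps(['tokenize', ['永和服装饰品有限公司'], {'mode': 'search'}])
response = handle(request)
print(json.loads(response.decode('utf-8')))  # list of [word, start, end] triples

# commands that only mutate state (e.g. add_word) return None from handle()
handle(json.dumps(['add_word', ['云计算'], {}]))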
def question3(weiboPostList, chineseFashionTerms):
    # Tokenize and count terms included in posts about each brand.
    ignoreMoreTerms = True  # Set to True to exclude more common Chinese terms.

    # Dump commonly-occurring tokens and symbols.
    ignoreTerms = chineseFashionTerms + [" ", "#", "@", ".", "。", "&", "spade", "回复", "【", "的", ",", "/",
                                         "]", "[", "!", ":", ":", "�", "~", "~", "`", "、", "】", "a", "t",
                                         "c", "h", "!", "cn", "http", ",", "哦", "了", "”", "“", ">", "$"]
    if ignoreMoreTerms:
        # Exclude these common Chinese pronouns, particles, and verbs (and, is, he, she, no, also, etc.).
        ignoreTerms.extend(["你", "我", "他", "她", "它", "都", "有", "是", "和", "在", "没", "不", "也", "日",
                            "就", "你们", "2015", "会", "为"])

    # Exclude other variations on the brand names.
    for term in ["Michael", "Kate", "Kors", "MK"]:
        ignoreTerms.append(term)
        ignoreTerms.append(term.upper())
        ignoreTerms.append(term.lower())

    for brand in ["Michael Kors", "Kate Spade"]:
        tokenFrequencies = {}  # Create a new dict to store the amount of occurrences of each token.
        if brand == "Michael Kors":
            currPostList = filter(lambda x: x.hasKors, weiboPostList)
        else:
            currPostList = filter(lambda x: x.hasSpade, weiboPostList)

        for post in currPostList:
            postText = unicode(post.text)  # convert to unicode (so as to be parseable by jieba)
            postTokens = jieba.tokenize(postText)  # collect tokenization results for that post in postTokens.
            for token in postTokens:
                if token[0] in ignoreTerms:
                    continue  # Drop ignored terms.
                if token[0] in tokenFrequencies:  # If it already exists in dict, +1
                    tokenFrequencies[token[0]] += 1
                else:  # Else, create an entry
                    tokenFrequencies[token[0]] = 1

        print
        print "Token frequency data for", brand + ":"
        # Print the top 12 here to quickly exclude values like "<?>" (the "can't display UTF" character).
        for data in sorted(tokenFrequencies.iteritems(), key=lambda tup: tup[1], reverse=True)[:12]:
            print "Word:", data[0], "\t", "Frequency:", data[1]
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
            rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
            tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
            mappingFile = argv[i + 1]

    with codecs.open(tokenizedFile, encoding='utf-8', mode='w') as output:
        id = 0
        for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
            result = jieba.tokenize(line.strip())
            newline = []
            for tk in result:
                tk = tk[0]
                if tk == ' ':
                    newline.append(' ')
                    continue
                if tk in punctuations:
                    newline.append(tk)
                    continue
                tk = ''.join([i for i in tk if not i.isdigit()]).lower()
                if len(tk) == 0:
                    newline.append(' ')
                    continue
                if tk not in mapping:
                    mapping[tk] = id2alpha(id)
                    id += 1
                newline.append(mapping[tk])
                newline.append(' ')
            output.write(u''.join(newline))
            output.write('\n')

    with codecs.open(mappingFile, encoding='utf-8', mode='w') as output:
        for (string, token) in mapping.iteritems():
            output.write(token)
            output.write(',')
            output.write(string)
            output.write('\n')
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
虽说房价居高不下让许多受访者观望,但是打算今年尽快出手的受访者也不少。“最近去售楼处咨询发现年前的优惠减少了不少,按照这趋势,开发商很有可能涨价,再等估计就更买不起。”黄小姐告诉记者,厦门气候宜居、房源供应不足又深受异地购房者的青睐,房价下跌的可能性非常小。加上今年是落户厦门的最后机会,为了赶上落户的“末班车”,还是尽早稳妥。
"""

# jieba.analyse.set_stop_words("stop_words.txt")
#
# seg_res = jieba.cut(raw_text)  # accurate mode by default
#
# seg_list = list(seg_res)
#
# print(seg_list)
#
# seg_freq_counter = Counter(seg_list)
#
# # print(seg_freq_counter)

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

words = jieba.tokenize(raw_text)
seg_list = []
for (w, start_pos, stop_pos) in words:
    if not accepted_chars.match(w) and len(w) <= 1:
        continue
    # TODO: isdigit() could be used here to filter out pure numbers
    seg_list.append(w)

seg_freq_counter = Counter(seg_list)
print(seg_freq_counter)
def cuttest(test_sent):
    global g_mode
    test_sent = test_sent.decode('utf-8')
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
fileList = []
i = 0
for filename in filenameList:
    basename = os.path.basename(filename)
    reader = open(filename)
    xmldict = xmltodict.parse(reader.read())
    reader.close()
    if xmldict['document']['header']['resultId'] == 'R0000':
        try:
            seqid = xmldict['document']['header']['sequenceId']
            filedate = xmldict['document']['header']['timestamp'][0:8]
            article = xmldict['document']['articles']['article']
            titledata = xmldict['document']['basicinfo']['title']
            postdata = article['topicinfo']['postdata']
            dictCutTitle = jiebaToList(jieba.tokenize(titledata, mode='default'))
            dictCutData = jiebaToList(jieba.tokenize(postdata, mode='default'))
            posttime = xmldict['document']['articles']['article']['topicinfo']['posttime']
            tmpDir = XML_DEST_DIR + filedate
            cutResult = {'result': '1', 'posttime': posttime, 'cuttitle': dictCutTitle, 'cutdata': dictCutData}
            dictConceptStat = {}
            listConceptWord = []
            for word in dictCutTitle:
                if dictConceptStat.has_key(word['word']):
                    dictConceptStat[word['word']] = dictConceptStat[word['word']] + 2
                else:
                    if dictConceptList.has_key(word['word']):
                        dictConceptStat[word['word']] = 2
            for word in dictCutData:
                if dictConceptStat.has_key(word['word']):
# remove whitespace, convert to lower case and remove non-alphabetic characters
comments = []
for comment in array:
    lower = comment.lower()
    comments.append(''.join([i for i in lower if i.isalpha()]))

# retrieve mentioned Chinese terms associated with each brand from all texts
mk = ['michaelkors', 'mk']
ks = ['katespade', 'ks']
mk_dict = {}
ks_dict = {}

# tokenize text in each weibo post
for sentence in comments:
    # create a counter object to count the occurrence of each term in texts
    c = Counter()
    result = jieba.tokenize(sentence)
    # create a list to store tokenized terms and their frequencies
    word = []
    for tk in result:
        # print "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
        word.append(tk[0])
    # update the counter object with new terms
    c.update(word)
    # find the number of co-occurrences with mk or ks for every token in all texts
    for key in c.keys():
        for word, count in c.most_common(10):
            if any(brand in key for brand in mk):
                mk_dict[word] = count
            elif any(brand in key for brand in ks):
                ks_dict[word] = count
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
# jieba.enable_parallel(4)

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list)  # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list)  # accurate mode

seg_list = jieba.cut("他来到了网易杭研大厦")  # accurate mode by default
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

# load a custom user dictionary file
jieba.load_userdict("G:\GitHub\MyRepository\Python\TestFiles\user_dict.txt")
seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

import jieba.posseg as pseg
words = pseg.cut("我爱北京天安门")
for w in words:
    print w.word, w.flag

result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
    print "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
def tokenize(self, sentence, mode='default', HMM=True):
    sentence = to_text(sentence)
    tokens = jieba.tokenize(sentence, mode=mode, HMM=HMM)
    return list(tokens)
def tokenize(l):
    words = []
    for s in l:
        # materialise each generator so callers get concrete token lists
        words.append(list(jieba.tokenize(s)))
    return words