def modern_chinese_tokenizer(raw_text):
    global TOKENIZER
    if TOKENIZER != 'Modern':
        # reload mmseg to re-init
        reset_mmseg()
        # directory of the modern-Chinese dictionary
        dirname = os.path.dirname(__file__)
        dictionary = os.path.join(dirname, 'modern words.dic')
        mmseg.dict_load_defaults()
        mmseg.Dictionary.load_words(dictionary)
        TOKENIZER = 'Modern'
    # process text
    #print raw_text.encode('utf-8')
    tokenizer = mmseg.Algorithm(raw_text.encode('utf-8-sig'))
    tokens = []
    for token in tokenizer:
        token = token.text.decode('utf-8-sig', errors='replace').replace(u'\x00', '')
        if token and token not in chinese_punctuation:
            tokens.append(token)
    return tokens
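# A minimal usage sketch (not part of the original code): it assumes the
# module-level TOKENIZER, reset_mmseg and chinese_punctuation referenced above
# are defined elsewhere in this module, that mmseg is already imported, and
# that 'modern words.dic' sits next to this file. The sample string is a
# hypothetical input.
if __name__ == '__main__':
    sample = u'现代汉语分词示例'
    print u' / '.join(modern_chinese_tokenizer(sample))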
def extract_mail_feature(m):
    f = {}
    # From address
    from_ = re.findall(r'<(.*?)>', m['From'])
    if from_:
        from_ = from_[0]
    else:
        from_ = m['From']
    f['FROM:' + from_] = 1
    # Subject
    sub = m['Subject'].split('\n')
    if len(sub) > 1:
        sub = '\n'.join(map(escsub, sub))
    else:
        sub = escsub(sub[0])
    is_chinese = bool(re.findall('[\x80-\xff].', sub.encode('utf8')))
    if is_chinese:
        words = filter(lambda i: i not in PUNC,
                       [unicode(i) for i in mmseg.Algorithm(sub)])
    else:
        words = sub.split()
    for w in words:
        f[w] = f.get(w, 0) + 1
    return f
def __call__(self, text):
    status = Status.wrap(text)
    content = status.get_content()
    algor = mmseg.Algorithm(content)
    tokens = map(lambda x: x.text, algor)
    # append the emos and topics
    for e in status.get_emos():
        algor = mmseg.Algorithm(e)
        tokens.extend(map(lambda x: x.text, algor))
    for t in status.get_topics():
        algor = mmseg.Algorithm(t)
        tokens.extend(map(lambda x: x.text, algor))
    return [x for x in tokens if x not in self.stopwords]
def feature_count():
    corpus = []
    with open('%s/restaurant.txt' % PROJECT_ROOT) as f:
        for line in f.readlines():
            id, restaurant = line.rstrip().split('\t')
            corpus.append([
                i.text for i in mmseg.Algorithm(restaurant.decode('utf-8'))
                if len(i.text) >= 2
            ])
    dictionary = corpora.Dictionary(corpus)
    _dict = {}
    for token, id in dictionary.token2id.iteritems():
        _dict[id] = token
    # print the 100 tokens with the highest document frequency
    _s = sorted(dictionary.dfs.iteritems(), key=lambda d: d[1], reverse=True)
    for i in _s[:100]:
        print _dict.get(i[0]), i[1]
def prefixs_for_term(self, term):
    """
    Get prefixes for TERM.
    """
    # Normalization
    term = term.lower()
    # Prefixes for each segmented word of the term
    prefixs = []
    tokens = mmseg.Algorithm(term)
    for token in tokens:
        word = token.text
        for i in xrange(1, len(word) + 1):
            prefixs.append(word[:i])
    return prefixs
def Segment(s):
    """
    Given a unicode string, performs Chinese segmentation. The result is a
    list of unicode strings, each being one "segment".

    Note that the underlying segmenter will occasionally throw out bits of
    text (particularly punctuation). This wrapper preserves those substrings
    by including them as distinct "segments".
    """
    assert type(s) is unicode
    s = s.encode('utf-8')
    tokens = mmseg.Algorithm(s)
    result = []
    pos = 0
    for token in tokens:
        if token.start > pos:
            # keep any text the segmenter skipped over
            result.append(s[pos:token.start].decode('utf-8'))
        result.append(token.text.decode('utf-8'))
        pos = token.end
    if pos < len(s):
        result.append(s[pos:].decode('utf-8'))
    return result
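# A minimal sketch of calling Segment (assumption, not from the original
# source): mmseg is already imported and the default dictionaries are loaded
# here via mmseg.dict_load_defaults(). The point is that punctuation skipped
# by the segmenter still shows up in the returned list.
if __name__ == '__main__':
    mmseg.dict_load_defaults()
    parts = Segment(u'今天天气不错，挺风和日丽的。')
    print u' | '.join(parts)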
import gensim, mmseg, os, codecs
from collections import defaultdict

frequency = defaultdict(int)

kstm_base = '/home/chris/00scratch/kansekitm'
corpus_base = '%s/corpus/zztj' % (kstm_base)

# load project-specific character and word dictionaries
t = mmseg.Dictionary()
t.load_chars('%s/dic/chars.dic' % (kstm_base))
t.load_words('%s/dic/words.dic' % (kstm_base))

files = os.listdir(corpus_base)
files.sort()

of = codecs.open("%s/out.txt" % (corpus_base), "w", "utf-8")
for f in files:
    if not f.startswith("zztj"):
        continue
    of.write("# file: %s\n" % (f))
    print "%s/%s" % (corpus_base, f)
    for line in codecs.open("%s/%s" % (corpus_base, f), 'r', 'utf-8'):
        if line[0] in ['*', '#']:
            continue
        l_out = []
        for l in line.split():
            if "@" in l:
                # already annotated: keep only the part after the '@'
                l_out.append(l.split('@')[-1])
            else:
                algor = mmseg.Algorithm(l)
                l_out.extend([tok.text for tok in algor])
        of.write("%s\n" % (" ".join(l_out)))
        for token in l_out:
            frequency[token] += 1
of.close()
def wordseg(text):
    # mmseg expects a UTF-8 byte string
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    algor = mmseg.Algorithm(text)
    return list(algor)
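# A minimal usage sketch (assumption, not from the original source): wordseg
# returns mmseg Token objects, so .text is needed to get the strings back;
# mmseg is assumed to be imported with its default dictionaries loaded here.
if __name__ == '__main__':
    mmseg.dict_load_defaults()
    tokens = wordseg(u'中文分词测试')
    print ' '.join(tok.text for tok in tokens)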
# coding: utf-8
import mmseg

#mmseg.dict_load_defaults()
#mmseg.dict_load_words("data/words.dic")
mmseg.dict_load_words("../data/remove_list.dic")

# interactively segment lines read from stdin
while True:
    a = raw_input()
    for tk in mmseg.Algorithm(a.decode("utf-8").encode("utf-8")):
        print tk.text, repr(tk.text), tk.attr
with open('/home/chenyanpeng/food2') as f:
    for l in f.readlines():
        try:
            segs = l.rstrip().split('\t')
            if segs[0] not in _dict:
                _dict[segs[0]] = [segs[1]]
            else:
                _dict[segs[0]].append(segs[1])
        except Exception, e:
            print str(e), l.rstrip()

count = 0
for id, v in _dict.iteritems():
    words = ",".join(v)
    # keep unique segments of length >= 2 that contain no digits
    r = list(set([
        i.text for i in mmseg.Algorithm(words.decode('utf-8'))
        if len(i.text) >= 2 and re.search('\d+', i.text) is None
    ]))
    item = EleFoodSegment(**{'id': id, "segments": json.dumps(r)})
    db_conn.merge(item)
    count += 1
    if count % 5000 == 0:
        print "%s \t commit" % count
        db_conn.commit()
db_conn.commit()


def basic_categorize():
    # set up rules for a simple cuisine categorization based on the restaurant name
# -*- coding=utf-8 -*-
import mmseg

mmseg.dict_load_defaults()

subject = "linux兼容硬件列表及笔记本usb摄头配置推荐"
algor = mmseg.Algorithm(subject)
for tk in algor:
    print tk.text
def normalize(self, prefix):
    """
    Normalize the search string.
    """
    tokens = mmseg.Algorithm(prefix.lower())
    return [token.text for token in tokens]
def __call__(self, text, must_chinese=True):
    algor = mmseg.Algorithm(text)
    words = [unicode(x.text) for x in algor]
    if must_chinese:
        # keep only tokens made up entirely of CJK characters
        words = filter(lambda x: re.match(u'^[\u4e00-\u9fa5]+$', x), words)
    return words