Example #1
    def modern_chinese_tokenizer(raw_text):
        global TOKENIZER
        if TOKENIZER != 'Modern':
            # reload mmseg to re-init
            reset_mmseg()
    
            # directory of the modern dictionary
            dirname = os.path.dirname(__file__)
            dictionary = os.path.join(dirname, 'modern words.dic')
            mmseg.dict_load_defaults()
            mmseg.Dictionary.load_words(dictionary)
            TOKENIZER = 'Modern'

        # process text
        tokenizer = mmseg.Algorithm(raw_text.encode('utf-8-sig'))

        tokens = []
        for token in tokenizer:
            token = token.text.decode('utf-8-sig', errors='replace').replace(u'\x00', '')
            if token:
                if token not in chinese_punctuation:
                    tokens.append(token)

        return tokens
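The module-level scaffolding this snippet leans on (TOKENIZER, reset_mmseg, chinese_punctuation) is not shown. A minimal sketch of what it might look like, with the punctuation set cut down to a sample:

import os
import mmseg

TOKENIZER = None   # which dictionary set is currently loaded

def reset_mmseg():
    # per the "reload mmseg to re-init" comment above: a module reload
    # is one way to reset pymmseg's global dictionary state
    reload(mmseg)

# sample only; a real set would cover far more punctuation
chinese_punctuation = set(u'\u3002\uff0c\u3001\uff1b\uff1a\uff1f\uff01')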
Example #2
def extract_mail_feature(m):
    f = {}

    # from addr
    from_ = re.findall(r'<(.*?)>', m['From'])
    if from_:
        from_ = from_[0]
    else:
        from_ = m['From']
    f['FROM:' + from_] = 1

    # subject
    sub = m['Subject'].split('\n')
    if len(sub) > 1:
        sub = '\n'.join(map(escsub, sub))
    else:
        sub = escsub(sub[0])

    is_chinese = bool(re.findall(r'[\x80-\xff].', sub.encode('utf-8')))
    if is_chinese:
        words = filter(lambda w: w not in PUNC,
                       [tok.text.decode('utf-8')
                        for tok in mmseg.Algorithm(sub.encode('utf-8'))])
    else:
        words = sub.split()
    for w in words:
        f[w] = f.get(w, 0) + 1

    return f
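escsub and PUNC come from elsewhere in the module. A usage sketch with hypothetical stand-ins for both:

import mmseg

mmseg.dict_load_defaults()
escsub = lambda s: s.strip()                   # stand-in for the real escsub
PUNC = set([u'\u3002', u'\uff0c', u'\uff01'])  # sample punctuation set

mail = {'From': u'Alice <alice@example.com>', 'Subject': u'hello world'}
features = extract_mail_feature(mail)
# -> one 'FROM:alice@example.com' feature plus a count per subject word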
Example #3
    def __call__(self, text):
        status = Status.wrap(text)
        content = status.get_content()

        algor = mmseg.Algorithm(content)
        tokens = [tok.text for tok in algor]

        # append tokens from the emoticons and topics as well
        for e in status.get_emos():
            tokens.extend(tok.text for tok in mmseg.Algorithm(e))

        for t in status.get_topics():
            tokens.extend(tok.text for tok in mmseg.Algorithm(t))

        return [x for x in tokens if x not in self.stopwords]
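Status is imported from elsewhere; the calls above pin down its interface as wrap(), get_content(), get_emos() and get_topics(). A hypothetical stub that makes those assumptions explicit:

class Status(object):
    """Stand-in for the real Status class; interface inferred from usage."""

    def __init__(self, text):
        self.text = text

    @classmethod
    def wrap(cls, text):
        return cls(text)

    def get_content(self):
        return self.text   # the real class likely strips emos/topics here

    def get_emos(self):
        return []          # the real class would extract emoticon markup

    def get_topics(self):
        return []          # the real class would extract #topic# spans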
Example #4
import mmseg
from gensim import corpora

# PROJECT_ROOT is defined elsewhere in the module


def feature_count():
    corpus = []
    with open('%s/restaurant.txt' % PROJECT_ROOT) as f:
        for line in f:
            _id, restaurant = line.rstrip().split('\t')
            corpus.append([
                i.text for i in mmseg.Algorithm(restaurant.decode('utf-8'))
                if len(i.text) >= 2
            ])
    dictionary = corpora.Dictionary(corpus)
    id2token = {}
    for token, token_id in dictionary.token2id.iteritems():
        id2token[token_id] = token
    by_freq = sorted(dictionary.dfs.iteritems(), key=lambda d: d[1], reverse=True)
    for token_id, freq in by_freq[:100]:
        print id2token.get(token_id), freq
Example #5
    def prefixs_for_term(self, term):
        """
    Get prefixs for TERM.
    """
        # Normalization
        term = term.lower()

        # Collect every character prefix of every token
        prefixs = []
        tokens = mmseg.Algorithm(term)
        for token in tokens:
            word = token.text
            for i in xrange(1, len(word) + 1):
                prefixs.append(word[:i])

        return prefixs
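Every token contributes all of its character prefixes. The inner loop on its own, assuming the default dictionaries are loaded and that mmseg keeps a Latin run such as 'hello' as a single token:

import mmseg

mmseg.dict_load_defaults()
for token in mmseg.Algorithm('hello'):
    word = token.text
    print [word[:i] for i in xrange(1, len(word) + 1)]
# ['h', 'he', 'hel', 'hell', 'hello']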
Example #6
def Segment(s):
    """
  Given a unicode string performs Chinese segmentation.

  Result is a list of unicode strings, each being one "segment". Nte
  that the underlying segmented will ocasionally throw out bits of text
  (particularly punctuation). This wrapper will preserve these
  substrings by including them as distinct "segments".
  """
    assert type(s) is unicode
    s = s.encode('utf-8')
    tokens = mmseg.Algorithm(s)
    result = []
    pos = 0
    for token in tokens:
        if token.start > pos:
            result.append(s[pos:token.start].decode('utf-8'))
        result.append(token.text.decode('utf-8'))
        pos = token.end
    if pos < len(s):
        result.append(s[pos:].decode('utf-8'))
    return result
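A quick check of the gap-preserving behaviour (default dictionaries assumed; exact token boundaries depend on the dictionary in use):

import mmseg

mmseg.dict_load_defaults()
parts = Segment(u'\u4f60\u597d, \u4e16\u754c')   # u'你好, 世界'
print u'|'.join(parts)
# whatever the segmenter skips (here the ', ') comes back as its own segment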
Example #7
import gensim, mmseg, os, codecs
from collections import defaultdict
frequency = defaultdict(int)
kstm_base = '/home/chris/00scratch/kansekitm'
corpus_base = '%s/corpus/zztj' % (kstm_base)
t = mmseg.Dictionary()
t.load_chars('%s/dic/chars.dic' % (kstm_base))
t.load_words('%s/dic/words.dic' % (kstm_base))
files = os.listdir(corpus_base)
files.sort()
of = codecs.open("%s/out.txt" % (corpus_base), "w", "utf-8")
for f in files:
    if not f.startswith("zztj"):
        continue
    of.write("# file: %s\n" % (f))
    print "%s/%s" % (corpus_base, f)
    for line in codecs.open("%s/%s" % (corpus_base, f), 'r', 'utf-8'):
        if line[0] in ['*', '#']:
            continue
        l_out = []
        for l in line.split():
            if "@" in l:
                l_out.append(l.split('@')[-1])
            else:
                algor = mmseg.Algorithm(l.encode('utf-8'))
                # decode back so the join below stays pure unicode
                l_out.extend(tok.text.decode('utf-8') for tok in algor)
        of.write("%s\n" % (" ".join(l_out)))
        for token in l_out:
            frequency[token] += 1
of.close()
Example #8
def wordseg(text):
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    algor = mmseg.Algorithm(text)
    # note: returns mmseg Token objects; use tok.text for the byte string
    return list(algor)
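A minimal usage sketch; dictionaries must be loaded before the first call:

import mmseg

mmseg.dict_load_defaults()
for tok in wordseg(u'\u5357\u4eac\u5e02\u957f\u6c5f\u5927\u6865'):
    print tok.text, tok.start, tok.end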
Example #9
# coding: utf-8
import mmseg
#mmseg.dict_load_defaults()
#mmseg.dict_load_words("data/words.dic")
mmseg.dict_load_words("../data/remove_list.dic")
while True:
    a = raw_input()
    # the decode/encode round-trip just validates the input is UTF-8
    for tk in mmseg.Algorithm(a.decode("utf-8").encode("utf-8")):
        print tk.text, repr(tk.text), tk.attr
Example #10
    with open('/home/chenyanpeng/food2') as f:
        for l in f:
            try:
                segs = l.rstrip().split('\t')
                if segs[0] not in _dict:
                    _dict[segs[0]] = [segs[1]]
                else:
                    _dict[segs[0]].append(segs[1])
            except Exception, e:
                print str(e), l.rstrip()
    count = 0
    for id, v in _dict.iteritems():
        words = ",".join(v)
        r = list(
            set([
                i.text for i in mmseg.Algorithm(words.decode('utf-8'))
                if len(i.text) >= 2 and re.search(r'\d+', i.text) is None
            ]))
        #r = ",".join(list(set([i.text for i in mmseg.Algorithm(words.decode('utf-8')) if len(i.text)>=2])))
        #print id, ",".join(list(set([i.text for i in mmseg.Algorithm(words.decode('utf-8')) if len(i.text)>=2 and re.search('\d+', i.text) is None])))
        item = EleFoodSegment(**{'id': id, "segments": json.dumps(r)})
        db_conn.merge(item)
        count += 1
        if count % 5000 == 0:
            print "%s \t commit" % count
            db_conn.commit()
    db_conn.commit()


def basic_categorize():
    # rule-based: simple cuisine categorization from the restaurant name
Example #11
# -*- coding=utf-8 -*-
import mmseg

mmseg.dict_load_defaults()
subject = "linux兼容硬件列表及笔记本usb摄头配置推荐"
algor = mmseg.Algorithm(subject)
for tk in algor:
    print tk.text
Example #12
    def normalize(self, prefix):
        """Normalize the search string."""
        tokens = mmseg.Algorithm(prefix.lower())
        return [token.text for token in tokens]
Example #13
    def __call__(self, text, must_chinese=True):
        algor = mmseg.Algorithm(text)
        # tok.text is a byte string in this binding; decode before matching
        words = [tok.text.decode('utf-8') for tok in algor]
        if must_chinese:
            words = [w for w in words if re.match(u'^[\u4e00-\u9fa5]+$', w)]
        return words
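The u'^[\u4e00-\u9fa5]+$' pattern keeps only strings made entirely of CJK Unified Ideographs (the basic block). The filter in isolation:

import re

words = [u'Python', u'\u4e2d\u6587', u'2024']
print [w for w in words if re.match(u'^[\u4e00-\u9fa5]+$', w)]
# [u'\u4e2d\u6587'] -- only the all-CJK token survives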