def mmseg_test():
    string = "最主要 的更 动是:张无忌最后没有选定自己的配偶。自己的自己"
    print(seg_txt(string))
    output = ""
    for i in seg_txt(string):
        output += i + " "
    print(output)

def get_chinese_similarity(s1, s2):
    """Get the similarity of two Chinese strings."""
    hash1 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s1))])
    hash2 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s2))])
    return hash1.similarity(hash2)

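# Hypothetical usage sketch for get_chinese_similarity (not part of the original
# source). It assumes the simhash class and the smart_str/smart_unicode helpers
# imported by the snippet above are available and that seg_txt comes from the
# mmseg package; scores close to 1.0 mean the two strings share most words.
def _similarity_demo():
    a = "张无忌最后没有选定自己的配偶"
    b = "张无忌最终没有选定配偶"
    print get_chinese_similarity(a, b)
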
def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0
    for cat_id, word_weights in self.cat_word_weight_.items():
        #print '---------------------------'
        weight = 0.0
        unknown_weight = 0.0
        for word in text_words:
            if word in word_weights:
                w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
                weight += w
                unknown_weight -= len(word) * 0.6
                #print word, w
            else:
                if word not in self.stop_words_:
                    unknown_weight += len(word) * 1.0
                    #print word, 'unknown'
                pass
        if weight > max_weight and unknown_weight < 0.0:
            max_weight = weight
            category = cat_id
            best_unknown_weight = unknown_weight
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]

def put(self, title, item_id):
    """
    title --> segment --> sadd(phrase, item_id) -> zadd(phrase->prefix, suffix, 0)
          --> pinyin  --> sadd(phrase, item_id) -> zadd(phrase->prefix, suffix, 0)
    """
    if not title or not item_id:
        return
    for phrase in mmseg.seg_txt(title.encode('utf8')):
        if not phrase:
            continue
        phrase = phrase.decode('utf8')
        self._add_phrase(chinese_key(phrase), item_id)
        for (key, suffix, score) in self._gen_suffix(phrase):
            self._add_suffix(key, chinese_key(suffix), score)
        if not self.pinyin:
            continue
        phrase = self.pinyin.translate(phrase)
        if not phrase:
            continue
        for sub_phrase in self._gen_pinyin_phrase(phrase):
            self._add_phrase(sub_phrase, item_id)
            for (key, suffix, score) in self._gen_suffix(re.sub('\\s+', '', sub_phrase)):
                self._add_suffix(key, suffix, score)

def train(filename, parser):
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        return
    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue
        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue
        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))
        word2count = defaultdict(int)
        word_list = list(seg_txt(utf8_ftoj(str(txt))))
        for i in word_list:
            word2count[i] += 1
        for k, v in word2count.iteritems():
            if k not in word2tag_count:
                word2tag_count[k] = {}
            t = word2tag_count[k]
            for id in tag_id_set:
                if id not in t:
                    t[id] = 0
                t[id] += (1 + log(float(v)))
    tofromfile.tofile(cache_path, word2tag_count)

def remove_stop_words(self, text):
    tokens = mmseg.seg_txt(text)
    left_words = []
    for t in tokens:
        if t not in self.stopwords:
            left_words.append(t)
    return "".join(left_words)

def suggest(self, phrase, start=1, limit=10, namespace='', expires=600):
    temp = re.split(r'\s+', phrase.strip())
    phrase = [item for item in mmseg.seg_txt(phrase.encode('utf8'))]
    phrase.extend(temp)
    phrase = map(chinese_key, phrase)
    start = (start - 1) * limit
    result_key = 'ac-suggest:' + '|'.join(phrase)
    results = self.r.zrevrange(result_key, start, start + limit - 1)
    if results:
        return results
    prefix = self.suffix_key_prefix + self.namespace
    prefix_len = len(prefix)
    phrase_keys = []
    for sub_phrase in phrase:
        key = prefix + sub_phrase
        results = self._suggest(key, limit)
        # strip the prefix off the keys that indicated they matched a lookup
        cleaned_keys = map(lambda x: x[prefix_len:], results)
        cleaned_keys = map(lambda x: self.phrase_key_prefix + self.namespace + x, cleaned_keys)
        phrase_keys.extend(cleaned_keys)
    if not phrase_keys:
        return []
    # combine the candidate sets (zinterstore keeps only items present in every set)
    num = self.r.zinterstore(result_key, list(set(phrase_keys)))
    self.r.expire(result_key, expires)
    # fetch the cached results
    results = self.r.zrevrange(result_key, start, start + limit - 1)
    return results

def split_words(text):
    """Segment text into a list of words."""
    words = []
    for i in seg_txt(text):
        words.append(i)
    return words

def GetTermsFrequency(text):
    ret = {}
    for w in mmseg.seg_txt(text):
        w = w.strip()
        if len(w) > 0:
            ret.setdefault(w, 0)
            ret[w] += 1
    return ret

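# Hypothetical quick check for GetTermsFrequency above (not part of the original
# source). It assumes the mmseg package providing seg_txt is installed and a
# Python 2 module with a UTF-8 source encoding, since seg_txt there consumes and
# yields UTF-8 byte strings.
def _terms_frequency_demo():
    freq = GetTermsFrequency("今天天气不错,今天心情也不错")
    for term, count in sorted(freq.items(), key=lambda kv: -kv[1]):
        print term, count
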
def store_movie(movie):
    phrase = movie["title"]
    seg_phrase = " ".join(mmseg.seg_txt(phrase))
    _pinyin_phrase = pinyin.get_pinyin(phrase)
    py_phrase = "".join([p[0] for p in _pinyin_phrase]).encode("utf-8")
    pinyin_phrase = "".join(_pinyin_phrase).encode("utf-8")
    phrase = "%s %s %s %s" % (phrase, seg_phrase, pinyin_phrase, py_phrase)
    engine.store_json(movie["id"], phrase, movie)

def tf_idf(self, txt):
    tf = defaultdict(int)
    for i in seg_txt(str(txt.lower())):
        tf[i] += 1
    result = []
    for k, v in tf.iteritems():
        if k in self._idf:
            result.append((k, v * self._idf[k]))
    return result

def generate_segmented_content_file(self):
    my_file = file('ordered_segmented_content_file.txt', 'w')
    with open('ordered_content_file.txt') as f:
        for line in f:
            print "Segmenting line {0}...".format(self.count)
            for segment in seg_txt(line):
                my_file.write(segment + ' ')
            my_file.write('\n')
            self.count += 1
    my_file.close()

def parse(self, words):
    words = SearchIndex.__to_unicode(words)
    _seg_words = [word for word in seg_txt(words)]
    seg_words = filter(None, _seg_words)
    results = []
    for word in seg_words:
        word_utf8 = SearchIndex.__to_unicode(word)
        decode_word = unidecode(word_utf8)
        key = self.cache_key_prefix + slugify(decode_word)
        results.append(key)
    return results

def get_terms(self):
    values = []
    for field in self._fields:
        values.append(self._data[field].encode('utf8'))
    text = ' '.join(values)
    terms = []
    for term in seg_txt(text):
        terms.append(term.decode('utf8'))
    return terms

def gen_terms(cont):
    if cont is None:
        return []
    cont = cont.strip()
    if len(cont) == 0:
        return []
    if len(cont) < TERM_MIN_LENGTH:
        return []
    terms = [item for item in seg_txt(cont) if len(item) > TERM_MIN_LENGTH]
    if len(cont) < 10:
        terms.append(cont)
    terms = list(set(terms))
    return terms

def main():
    count = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = mmseg.seg_txt(text.encode('utf-8'))
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write((' '.join(wds)).decode('utf-8'))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'

def generate_user_dict(w_uid):
    user = Account.objects.get(w_uid=w_uid)
    wbs = user.watchweibo.all()
    wordset = Set()
    print 'Generating user dict from %d weibo posts' % len(wbs)
    for wb in wbs:
        for word in seg_txt(wb.text.encode('utf-8', 'ignore')):
            if len(word) > 3:
                wordset.add(word.lower().strip())
    with open("../data/user_dict/%s.dic" % w_uid, "w") as dic_file:
        for word in wordset:
            dic_file.write("%s\n" % word)

def input_raw(self, sentence, is_spam):
    """
    Feed in one training sample.
    :param sentence: the sentence to train on
    :param is_spam: whether the message is spam
    :return:
    """
    sms = mmseg.seg_txt(sentence)
    sms = list(sms)
    for flag, word in enumerate(sms):
        offset = 0 if is_spam else 1
        if word not in self.sms_value:
            self.sms_value[word] = [1 - offset, offset]
        else:
            self.sms_value[word][offset] += 1
        self.sms_count[offset] += 1

def segment(string):
    # alphas = ''
    # unicode = ''
    # last_is_alpha = False
    # for char in string:
    #     if char.isalpha():
    #         if not last_is_alpha:
    #             alphas += ' '
    #         alphas += char
    #     else:
    #         if last_is_alpha:
    #             unicode += ' '
    #         unicode += char
    # print "ALPHAS", alphas
    # print "UNICODE", unicode[:20]
    # return alphas + u' '.join([txt.decode('utf8') for txt in seg_txt(unicode.encode('utf8'))])
    return u' '.join([txt.decode('utf8') for txt in seg_txt(string.encode('utf8'))])

def count_occurance(self, text=''):
    if not isinstance(text, basestring):
        raise Exception("input must be instance of String")
    separated_by_non_alphanumerics = text.replace('/', ' ').replace('\\', ' ').replace('>', ' ').replace('<', ' ').lower()
    #print separated_by_non_alphanumerics
    without_one_or_two_words = self.__class__.one_or_two_words_re.sub('', separated_by_non_alphanumerics)
    without_dots = without_one_or_two_words.replace(".", "")
    text_chunks = self.stopwords.to_re().sub('', without_dots).split()
    frequencies = {}
    for word in text_chunks:
        seg = mmseg.seg_txt(word)
        for s in seg:
            frequencies[s] = (frequencies[s] if frequencies.has_key(s) else 0) + 1
    return frequencies

def create_action(self):
    cache_key = "WEIBO:HOT:%s" % self.user.sns_id
    cache.delete(cache_key)
    tmp_cache_key = "TEMP:WEIBO:HISTORY:%s:::" % self.user.sns_id
    weibo_history = self.user.weibo_history
    for text in weibo_history:
        terms = seg_txt(text.encode('utf-8'))
        for term in terms:
            index_key = '%s%s' % (BASIC_TAG_PREFIX, term)
            if cache.exists(index_key):
                key = tmp_cache_key + term.decode('utf-8')
                cache.incr(name=key, amount=1)
    keys = cache.keys(pattern="%s*" % tmp_cache_key)
    for key in keys:
        name = key.split(":::")[1]
        value = float(cache.get(key))
        cache.zadd(cache_key, value, name)
        cache.delete(key)
        tag = BasicTag.get_by_name(name=name)
        if not tag:
            continue
        relations = tag.friends
        score = tag.score
        for f in relations:
            items = f.split(':::')
            obj_name = items[0]
            obj_value = float(items[1])
            result = obj_value / 50 * value
            cache.zadd(cache_key, result, obj_name)
    results = cache.zrevrange(name=cache_key, start=0, num=30, withscores=True)
    tags = [result[0].decode('utf-8') + '__' + str(result[1]) for result in results]
    self.user.update(set__tags=tags)

def remove(self, title, item_id):
    if not title or not item_id:
        return
    for phrase in mmseg.seg_txt(title.encode('utf8')):
        if not phrase:
            continue
        phrase = phrase.decode('utf8')
        self._rem_phrase(chinese_key(phrase), item_id)
        if not self.pinyin:
            continue
        phrase = self.pinyin.translate(phrase)
        if not phrase:
            continue
        for sub_phrase in self._gen_pinyin_phrase(phrase):
            self._rem_phrase(sub_phrase, item_id)

def get(url, headers, body):
    query = headers.get('QUERY')
    if query is None or query.strip() == '':
        return 400, 'Bad Request', 'query field is not found.', None
    params = dict((n, v) for n, v in (i.split('=', 1) for i in query.split('&')))
    if 'query' not in params:
        return 400, 'Bad Request', 'query field is not found.', None
    text = params['query']
    search_query = helpers.decode_urlencoding(text)
    # helpers.log_search_query(search_query)
    global logger
    logger.debug('incoming query: %s', text)
    terms = seg_txt(search_query)
    logger.debug('terms from query: %s', terms)
    database = xapian.Database('../indexes/')
    enquire = xapian.Enquire(database)
    l = []
    for term in terms:
        l.append(term)
    q = xapian.Query(xapian.Query.OP_OR, l)
    enquire.set_query(q)
    matches = enquire.get_mset(0, 100)
    print '%i results found.' % matches.get_matches_estimated()
    print 'Result - %i:' % matches.size()
    r = []
    for m in matches:
        # print '%i: %i%% docid=%i [%s]' % (m.rank + 1, m.percent, m.docid, m.document.get_data())
        r.append(m.document.get_data())
    print json.dumps(r)
    return 200, 'OK', json.dumps(r), None

def predict(self, sentence):
    def is_zero(value):
        return value if value > 0 else 0.01

    sms = mmseg.seg_txt(sentence)
    sms = set(sms)
    sms_prob_ham = sms_prob_spam = 1
    for flag, word in enumerate(sms):
        word_prob_spam = word_prob_ham = 0
        if word in self.sms_value:
            value = self.sms_value[word]
            word_prob_spam = float(value[0]) / self.sms_count[0]  # probability of this word given spam
            word_prob_ham = float(value[1]) / self.sms_count[1]   # probability of this word given ham
        word_prob_spam = is_zero(word_prob_spam)
        word_prob_ham = is_zero(word_prob_ham)
        # combine the per-word estimates
        prob_is_spam = word_prob_spam / (word_prob_spam + word_prob_ham)  # word_prob_ham acts as the complement
        sms_prob_spam *= prob_is_spam
        sms_prob_ham *= (1 - prob_is_spam)
    return sms_prob_spam / (sms_prob_spam + sms_prob_ham)

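# Hypothetical end-to-end sketch for the naive Bayes SMS filter above (not part
# of the original source). It assumes input_raw and predict are methods of a
# classifier class, here called SpamFilter, whose __init__ sets sms_value = {}
# and sms_count = [0, 0]; index 0 tracks spam counts and index 1 tracks ham
# counts, matching how input_raw fills them in.
def _spam_filter_demo():
    clf = SpamFilter()
    clf.input_raw("恭喜您获得大奖,请点击链接领取", True)   # spam sample
    clf.input_raw("今晚一起吃饭吗", False)                 # ham sample
    # Scores close to 1.0 mean "probably spam".
    print clf.predict("点击链接领取大奖")
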
def normalize_syn_words(self, text):
    tokens = mmseg.seg_txt(text)
    word_list = [x for x in tokens]
    wlist_len = len(word_list)
    for i in xrange(wlist_len):
        if word_list[i] == "":
            continue
        curr_len = 0
        j = i
        while j < wlist_len:
            curr_len += len(word_list[j])
            if curr_len > self.max_len_to_replace:
                break
            j += 1
        while j > i:
            wrf = "".join(word_list[i:j])
            if wrf in self.replace_dict:
                wrt = self.replace_dict[wrf]
                word_list[i] = wrt
                for k in xrange(i + 1, j):
                    word_list[k] = ""
                break
            j -= 1
    return "".join(word_list)

def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0
    for cat_id, word_weights in self.cat_word_weight_.items():
        #print '---------------------------'
        weight = 0.0
        unknown_weight = 0.0
        for word in text_words:
            if len(word.strip()) == 0:
                continue
            if word in word_weights:
                w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
                weight += w
                unknown_weight -= len(word) * 0.87
                #print word, w
            else:
                if word not in self.stop_words_:
                    if not word[0].isalpha():  # ignore unrecognized English words
                        unknown_weight += len(word) * 1.0
                        #print word, 'unknown'
                else:
                    #print word, 'stop word'
                    pass
        #print 'unknown_weight', unknown_weight
        if weight > max_weight and unknown_weight < 0.0:
            max_weight = weight
            category = cat_id
            best_unknown_weight = unknown_weight
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]

def txt2word(txt):
    return seg_txt(utf8_ftoj(str(txt.lower())))

#encoding=utf-8
import mmseg
#from pymmseg import mmseg
#mmseg.dict_load_defaults()

f = open('MMSEGoutput.txt', 'w')
input = open('testinput.txt')
while True:
    text = input.readline()
    for i in mmseg.seg_txt(text):
        print >> f, i, ' ',
    #f.write(testseg)
    print >> f
    if len(text) == 0:
        break
f.flush()
f.close()
input.close()

#f = open('1.txt', 'w')
#for i in mmseg.seg_txt(text):
#    print >> f, i
#algor = mmseg.Algorithm(text)
#for tok in algor:
#    print >> f, '%s [%d..%d]' % (tok.text, tok.start, tok.end)
#    print '%s' % tok.text

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
main.py

Author: WooParadog
Email: [email protected]

Created on 2011-11-13
'''
import mmseg
import mmseg.search

f = open('text')
dic = {}
for word in mmseg.seg_txt(f.read()):
    if word in dic.keys():
        dic[word] = int(dic[word] + 1)
    else:
        dic[word] = 1
f.close()

import operator
word = sorted(dic.iteritems(), key=operator.itemgetter(1), reverse=True)
print word

f = open('word', 'w')
f.writelines([str(k) + ":" + str(v) + "\n" for (k, v) in word])
f.close()

n = 0
for name in names:
    f = os.path.join(dir, name)
    print '\nFile: ', f, '...'
    nout = name + '.txt'
    if os.path.exists(nout):
        print '-- SKIPPED'
        continue
    fout = open(nout, 'w')
    subject, text = read_eml(f)
    # words = fc(subject)
    words = seg_txt(subject)
    fout.write('{}\n\n'.format(' '.join(words)))
    lines = text.splitlines()
    for line in lines:
        #text = '感谢您关注语言云,您的语言云账号已经激活。这封邮件包含您调用语言云服务时使用的token,以及一些其他帮助您快速使用语言云的信息。'
        line = line.strip()
        # print '[', line, ']'
        if line != '':
            #words = fc(line)
            # materialize the generator so it can be both written and printed
            words = list(seg_txt(line))
            fout.write(' '.join(words) + '\n')
            for w in words:
                print w
    fout.close()

def append(self, txt):
    for i in set(seg_txt(str(txt.lower()))):
        self._idf[i] += 1
    self._count += 1

def generate_feature(wb, dict):
    fea = [0] * len(dict)
    # weibo text
    word_count = 0
    for wd in seg_txt(wb.text.encode('utf-8', 'ignore')):
        wd = wd.lower().strip()
        if len(wd) > 3 and wd in dict:
            fea[dict[wd]] += 1
            word_count += 1
    print 'found %d word in a weibo' % word_count
    # add user features
    owner = wb.owner
    fea.append(int(owner.w_province))
    fea.append(int(owner.w_city))
    if owner.w_url:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(len(owner.w_description))
    if 'm' in owner.w_gender:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(int(owner.w_followers_count))
    fea.append(int(owner.w_friends_count))
    fea.append(int(owner.w_statuses_count))
    fea.append(int(owner.w_favourites_count))
    fea.append(int(owner.w_bi_followers_count))
    fea.append((datetime.now() - owner.w_created_at).days / 100)
    if owner.w_verified:
        fea.append(1)
    else:
        fea.append(0)
    # add weibo features
    fea.append(int(wb.reposts_count))
    fea.append(int(wb.comments_count))
    fea.append(int(wb.attitudes_count))
    if re.search("#.*?#", wb.text):
        fea.append(1)
    else:
        fea.append(0)
    fea.append(len(wb.text))
    own_text = re.search("(.*?)//@", wb.text)
    if own_text:
        fea.append(len(own_text.group(1)))
    else:
        fea.append(len(wb.text))
    # TODO: categorize the source field
    fea.append(len(wb.source))
    if wb.retweeted_status:
        fea.append(0)
    else:
        fea.append(1)
    if wb.thumbnail_pic:
        fea.append(1)
    else:
        fea.append(0)
    fea.append(wb.created_at.hour)
    fea.append(wb.created_at.weekday())
    # TODO: apply a time-decay formula to repost/comment counts
    return fea

def txt_tag_generator(self):
    word2id = self.word2id
    for k, v in self._txt_tag_generator():
        words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
        yield word2id.id_list_by_word_list(words), v

def tf_idf_seg_txt(txt):
    txt = txt.replace('。', ' ').replace(',', ' ')
    word_list = list(seg_txt(txt))
    return tf_idf(word_list)

def tokenize(self, stream):
    import mmseg
    for chunk in self._imp_tokenizer.tokenize(stream):
        r = mmseg.seg_txt(chunk.encode('utf8', 'ignore'))
        for word in r:
            yield word.decode('utf8', 'ignore')

def separatewords(self, text):
    words = [s.lower() for s in seg_txt(text.encode('utf-8')) if s != '']
    print words
    return words

def isindexed(self, url):
    u = self.con.execute(
        "select rowid from urllist where url='%s'" % url).fetchone()
    if u != None:
        v = self.con.execute(
            'select * from wordlocation where urlid=%d' % u[0]).fetchone()
        if v != None:
            print "indexed :", url
            return True
    return False

def addlinkref(self, urlFrom, urlTo, linkText):
    fromid = self.getentryid('urllist', 'url', urlFrom)
    toid = self.getentryid('urllist', 'url', urlTo)
    cur = self.con.execute(
        "select rowid from link where fromid='%s' and toid='%s'" % (fromid, toid))
    res = cur.fetchone()
    if res == None:
        cur = self.con.execute(
            "insert into link (fromid,toid) values ('%s','%s')" % (fromid, toid))
        linkid = cur.lastrowid
    else:
        linkid = res[0]
    words = self.separatewords(linkText)
    for word in words:
        wordid = self.getentryid('wordlist', 'word', word)
        cur = self.con.execute(
            "insert into linkwords (wordid,linkid) values ('%s','%s')" % (linkid, wordid))

def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except:
                print "Could not open %s" % page
                continue
            soup = BeautifulSoup(c.read())
            if not self.isindexed(page):
                self.addtoindex(page, soup)
            else:
                continue
            links = soup('a')
            for link in links:
                if ('href' in dict(link.attrs)):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
            self.dbcommit()
        pages = newpages

def createindextables(self):
    self.con.execute('create table IF NOT EXISTS urllist(url)')
    self.con.execute('create table IF NOT EXISTS wordlist(word)')
    self.con.execute('create table IF NOT EXISTS wordlocation(urlid integer,wordid integer,location)')
    self.con.execute('create table IF NOT EXISTS link(fromid integer,toid integer)')
    self.con.execute('create table IF NOT EXISTS linkwords(wordid integer,linkid integer)')
    self.con.execute('create index IF NOT EXISTS wordidx on wordlist(word)')
    self.con.execute('create index IF NOT EXISTS urlidx on urllist(url)')
    self.con.execute('create index IF NOT EXISTS wordurlidx on wordlocation(wordid)')
    self.con.execute('create index IF NOT EXISTS urltoidx on link(toid)')
    self.con.execute('create index IF NOT EXISTS urlfrom on link(fromid)')
    self.dbcommit()

def calculatepagerank(self, iterations=20):
    self.con.execute('drop table if exists pagerank')
    self.con.execute('create table pagerank(urlid primary key,score)')
    self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
    self.dbcommit()
    for i in range(iterations):
        print "Iteration %d" % (i)
        for (urlid,) in self.con.execute('select rowid from urllist'):
            pr = 0.15
            for (linker,) in self.con.execute(
                    'select distinct fromid from link where toid=%d' % urlid):
                linkingpr = self.con.execute(
                    'select score from pagerank where urlid=%d' % linker).fetchone()[0]
                linkingcount = self.con.execute(
                    'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                pr += 0.85 * (linkingpr / linkingcount)
            self.con.execute('update pagerank set score=%f where urlid=%d' % (pr, urlid))
        self.dbcommit()

#!/usr/bin/env python
#coding:utf-8
from __future__ import print_function

import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i]):
            out_line += " " + j
    print(out_line)

#!/usr/bin/env python
#coding:utf-8
import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i]):
            out_line += " " + j
    print out_line

#!/usr/bin/env python3
# coding:utf-8
import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if (blks[i] == "[VOCALIZED-NOISE]"
                or blks[i] == "[NOISE]"
                or blks[i] == "[LAUGHTER]"):
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i].encode()):
            out_line += " " + j.decode()
    print(out_line)

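# Illustrative note (not part of the original source): given a transcript line
# such as
#   utt001 [NOISE] 今天天气不错
# the scripts above keep the utterance id and the noise tags untouched and pass
# the remaining tokens through seg_txt, producing something like
#   utt001 [NOISE] 今天 天气 不错
# (the exact split depends on the mmseg dictionary in use).
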