예제 #1
0
파일: write-db.py 프로젝트: chenghuige/gezi
def get_len(file_):
    """Count the leading records of the input file that pass the thresholds.

    Every line must hold exactly nine tab-separated fields, in this order:
    uid, user_name, score, post_id, forum_name, title, content, feature_len,
    feature.  Only ``score`` and ``title`` influence the result.

    Records whose title contains no '#' are skipped and not counted.
    Scanning stops at the first counted record that trips a threshold; that
    record is still included in the returned count.

    The thresholds are the module-level names ``thre`` and ``thre2``:

    * a value greater than 1 caps the number of counted records;
    * any other value is a minimum score;
    * ``thre2 == 0`` switches the second threshold off.

    Args:
        file_: Not used by the body; the input path is read from
            ``sys.argv[1]``.  NOTE(review): this looks unintended, but the
            callers are not visible from here, so the behavior is kept.

    Returns:
        The number of counted records.

    Raises:
        ValueError: If a line does not split into nine fields or its score
            is not a valid float.
    """
    count = 0
    # Context manager so the handle is closed even when we break out early.
    with open(sys.argv[1]) as records:
        for line in records:
            # Unpacking all nine columns keeps the strict field-count check.
            _, _, score, _, _, title, _, _, _ = line.rstrip('\n').split('\t')
            score = float(score)
            if '#' not in title:
                continue
            count += 1

            if thre > 1:
                if count > thre:
                    break
            elif score < thre:
                break

            if thre2 != 0:
                # Same count-cap / score-floor split as for ``thre``.  The
                # score test belongs to the ``thre2 > 1`` decision; hanging
                # it off ``thre2 != 0`` would make a fractional ``thre2``
                # a no-op.
                if thre2 > 1:
                    if count > thre2:
                        break
                elif score < thre2:
                    break
    return count
예제 #2
0
def deal(li):
    """Print distinct-post statistics for one group of rows.

    Three sets of ``row[1]`` values (named ``pid`` here) are collected:

    * every pid seen;
    * pids of rows whose title (``row[-2]``) satisfies ``gezi.is_thread``;
    * pids of those thread rows whose content (``row[-1]``) also satisfies
      ``gezi.contains_pic``.

    One tab-separated line is printed: column 0 of the last row followed by
    the sizes of the three sets.  Nothing is printed when ``li`` yields no
    rows.

    Args:
        li: Iterable of indexable rows with string fields.
    """
    all_pids = set()
    thread_pids = set()
    pic_thread_pids = set()

    row = None
    for row in li:
        pid = row[1]
        title = gezi.to_gbk(row[-2])
        content = gezi.to_gbk(row[-1])

        all_pids.add(pid)
        if gezi.is_thread(title):
            thread_pids.add(pid)
            if gezi.contains_pic(content):
                pic_thread_pids.add(pid)

    if row is None:
        # Empty input: there is no row to take the leading key from.
        return

    # A single parenthesised expression prints identically on Python 2 and 3.
    print(row[0] + '\t' + str(len(all_pids)) + '\t' +
          str(len(thread_pids)) + '\t' + str(len(pic_thread_pids)))
예제 #3
0
def deal(li):
    """Print how often each other user appears alongside the thread row's uid.

    Rows are scanned in order.  A row whose title (``row[2]``) satisfies
    ``gezi.is_thread`` sets ``louzhu_uid`` to that row's uid (``row[1]``);
    if several rows qualify, the last one wins.  Every other row adds one to
    the tally of its uid.

    If a thread row was found, one line ``louzhu_uid<TAB>uid<TAB>count`` is
    printed for each tallied uid that differs from ``louzhu_uid``.  Nothing
    is printed otherwise.

    Args:
        li: Iterable of indexable rows with string fields.
    """
    # "louzhu" is presumably pinyin for the thread starter -- confirm.
    louzhu_uid = None
    reply_counts = {}
    for row in li:
        uid = row[1]
        if gezi.is_thread(gezi.to_gbk(row[2])):
            louzhu_uid = uid
        else:
            reply_counts[uid] = reply_counts.get(uid, 0) + 1

    if louzhu_uid is None:
        return

    for uid, count in reply_counts.items():
        if uid != louzhu_uid:
            # A single parenthesised expression prints identically on
            # Python 2 and 3.
            print(louzhu_uid + '\t' + uid + '\t' + str(count))
예제 #4
0
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   utf8togbk.py
#        \author   chenghuige
#          \date   2015-03-13 19:53:33.858613
#   \Description
# ==============================================================================

import sys, os
import gezi

for line in open(sys.argv[1]):
    print gezi.to_gbk(line),
예제 #5
0
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   op-filter.py
#        \author   chenghuige  
#          \date   2015-03-02 17:15:06.828377
#   \Description  
# ==============================================================================

import sys,os
sys.path.append('./')
import gezi 

for line in sys.stdin:
	l = line.rstrip().split('\t')
	opname = gezi.to_gbk(l[-1])
	if opname.startswith('Ìù°É'):
		print line,
 
예제 #6
0
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   cut-cn.py
#        \author   chenghuige
#          \date   2015-03-06 14:40:14.908346
#   \Description
# ==============================================================================

import sys, os
sys.path.append('./')
import gezi

# For each stdin line: take the first tab-separated column, convert it with
# gezi.to_gbk, keep what gezi.extract_chinese returns, and print every item
# that gezi.to_cnvec produces from it on a line of its own.
sep = '\t'
for line in sys.stdin:
    first_column = line.rstrip('\n').split(sep)[0]
    chinese_text = gezi.extract_chinese(gezi.to_gbk(first_column))
    for word in gezi.to_cnvec(chinese_text):
        print(word)
예제 #7
0
    uid = l[0]
    set_.add(uid)

for line in open(sys.argv[1]):
    uid, user_name, score, post_id, forum_name, title, content, feature_len, feature = line.rstrip(
        '\n').split('\t')
    if uid in set_:
        continue

    if content > 1000:
        content = content[:1000]
    content = content.replace('"', '')
    title = title.replace('"', '')
    val = [
        monitor_type,
        str(count) + ' ' + score + ' ' + gezi.to_gbk(title),
        gezi.to_gbk(content),
        gezi.to_gbk(user_name),
        gezi.to_gbk(forum_name), uid
    ]
    score = float(score)
    if len(title.split('#')) < 2:
        continue
    count += 1
    if thre > 1:
        if count > thre:
            break
    else:
        if score < thre:
            break
    if thre2 != 0: