Пример #1
0
def indb(title,url,body):
	url = "http://baby.sina.com.cn%s" % url
	title = title.encode("utf8")
	kid = uid.getKid(url)
	text = html2text(body)
	pinyin = addpinyin(text)
	tag = c1.classify(body)
	print kid,title,url,tag
	os.popen(cmd % (kid,title,now.date(),tag))

	mc = memcache.Client(['114.113.30.29:11211'])
	dbvalue=cjson.encode({"title":title.decode("utf8"),"url":url,"html":body,"text":text,"datetime":str(now),"addpinyin":pinyin,"body":text,"kid":kid})
	mc.set(str(kid),dbvalue)
Пример #2
0
    for title in re.findall(r'<title>(.*)</title>',s):
        try:
            return title.decode('utf8')
        except:
            return title
    return u''

path = sys.argv[1]
pd = PyDystopia()
mc = memcache.Client(['114.113.30.29:11211'])
for fname in findpath(path):
    #fname = os.path.join(d,f)
    if fname.find("article")==-1:continue
    r=readtext(fname)
    if r=='':continue
    key = "%s" % hashlib.md5(fname).hexdigest()
    kid=string.atoi(key[:10],16)
    nowtime=datetime.datetime.now()
    print "key",key,kid,fname
    try:
        text = html2text(r)
        body = getbody(r.encode("utf8"))
    except:
        continue
    ttl = readtitle(fname)
    pinyin = addpinyin(body)
    dbvalue=cjson.encode({"title":ttl,"url":fname,"html":r,"text":text,"datetime":str(nowtime),"addpinyin":pinyin,"body":body,"kid":kid})
    pd.put(kid,text.encode('utf8'))
    mc.set(str(kid),dbvalue)
pd.commit()
Пример #3
0
# encoding: utf-8
try:
    import cmemcache as memcache
except:
    import memcache
from addpinyin import *
import datetime
import cjson
import os
import uid

url = ''
tag = '专家咨询'
title = u''
text = u'''

'''
pinyin = addpinyin(text)
now = datetime.datetime.now()

kid = uid.getKid(url)
print kid,title

mc = memcache.Client(['114.113.30.29:11211'])
dbvalue=cjson.encode({"title":title,"url":url,"html":text,"text":text,"datetime":str(now),"addpinyin":pinyin,"body":text,"kid":kid})
mc.set(str(kid),dbvalue)

cmd = 'tctmgr put infodb/infodb %s "title" "%s" "savedate" "%s" "tag1" "%s"' % (kid,title.encode("utf8"),now.date(),tag)
print os.popen(cmd).read()