def indb(title,url,body): url = "http://baby.sina.com.cn%s" % url title = title.encode("utf8") kid = uid.getKid(url) text = html2text(body) pinyin = addpinyin(text) tag = c1.classify(body) print kid,title,url,tag os.popen(cmd % (kid,title,now.date(),tag)) mc = memcache.Client(['114.113.30.29:11211']) dbvalue=cjson.encode({"title":title.decode("utf8"),"url":url,"html":body,"text":text,"datetime":str(now),"addpinyin":pinyin,"body":text,"kid":kid}) mc.set(str(kid),dbvalue)
# NOTE(review): this code arrived with its line breaks collapsed; it is left
# byte-for-byte untouched below and only described here.
#
# 1. Tail of a function whose header is not visible in this chunk: it returns
#    the first <title>...</title> match found in `s`, decoded from UTF-8 when
#    that succeeds (the raw match otherwise), or u'' when nothing matches.
# 2. Script body: for every path yielded by findpath(sys.argv[1]) that
#    contains "article", the file is read with readtext() and skipped when the
#    result is empty.  `kid` is the first 10 hex digits of the MD5 of the path
#    parsed as a base-16 integer.  html2text()/getbody() convert the content
#    (any exception skips the file).  The UTF-8 encoded text is passed to
#    pd.put(kid, ...) and a cjson-encoded record is stored in memcache under
#    str(kid).
#
# NOTE(review): with the indentation lost it cannot be told whether
# pd.commit() runs once after the loop or on every iteration -- confirm
# against the original file before reformatting this code.
for title in re.findall(r'<title>(.*)</title>',s): try: return title.decode('utf8') except: return title return u'' path = sys.argv[1] pd = PyDystopia() mc = memcache.Client(['114.113.30.29:11211']) for fname in findpath(path): #fname = os.path.join(d,f) if fname.find("article")==-1:continue r=readtext(fname) if r=='':continue key = "%s" % hashlib.md5(fname).hexdigest() kid=string.atoi(key[:10],16) nowtime=datetime.datetime.now() print "key",key,kid,fname try: text = html2text(r) body = getbody(r.encode("utf8")) except: continue ttl = readtitle(fname) pinyin = addpinyin(body) dbvalue=cjson.encode({"title":ttl,"url":fname,"html":r,"text":text,"datetime":str(nowtime),"addpinyin":pinyin,"body":body,"kid":kid}) pd.put(kid,text.encode('utf8')) mc.set(str(kid),dbvalue) pd.commit()
# encoding: utf-8
# Manual insert script: builds a single record from the literals below, stores
# it in memcache under its kid and registers it with ``tctmgr put``.

# Use cmemcache when it can be imported, otherwise fall back to memcache.
# Only ImportError triggers the fallback, so Ctrl-C or an unrelated failure
# during import is no longer swallowed.
try:
    import cmemcache as memcache
except ImportError:
    import memcache
from addpinyin import *
import datetime
import cjson
import os
import pipes
import uid

# Record fields (presumably edited by hand before each run -- TODO confirm).
url = ''
tag = '专家咨询'
title = u''
text = u''' '''

pinyin = addpinyin(text)
now = datetime.datetime.now()
kid = uid.getKid(url)
# Single-argument print(...) parses the same way on Python 2 and Python 3.
print("%s %s" % (kid, title))

mc = memcache.Client(['114.113.30.29:11211'])
# The same text is stored under "html", "text" and "body".
dbvalue = cjson.encode({
    "title": title,
    "url": url,
    "html": text,
    "text": text,
    "datetime": str(now),
    "addpinyin": pinyin,
    "body": text,
    "kid": kid,
})
mc.set(str(kid), dbvalue)

# os.popen runs the command through a shell.  title and tag are free text, so
# they are shell-quoted (pipes.quote is Python 2's spelling of shlex.quote)
# instead of being pasted between double quotes, where a '"', '$' or backtick
# in the value would break or alter the command.
cmd = 'tctmgr put infodb/infodb %s "title" %s "savedate" "%s" "tag1" %s' % (
    kid,
    pipes.quote(title.encode("utf8")),
    now.date(),
    pipes.quote(tag),
)
print(os.popen(cmd).read())