def calAct(self, col, cond):
    """Count ``mfwuserfeed`` rows per distinct value of ``col``.

    Runs a ``group by`` query over ``mfwuserfeed`` restricted by ``cond``,
    collects the result into a ``{value: count}`` dict and returns whatever
    ``self.calDictRate`` produces from that dict.

    :param col: column name (or SQL expression) to group by.
    :param cond: SQL text placed after ``where``.
    :return: the result of ``self.calDictRate`` applied to the counts
        (presumably a value -> rate mapping -- TODO confirm against
        ``calDictRate``).
    """
    # NOTE: ``col`` and ``cond`` are spliced into the SQL text verbatim, so
    # callers must only pass trusted, already-valid SQL fragments.
    sql = (
        "select " + col
        + " as act,count(*) as count from mfwuserfeed where "
        + cond
        + " group by " + col
    )
    counts = {}
    for row in dbconn.query(sql):
        counts[row.act] = row.count
    return self.calDictRate(counts)
def calDense(self, col, cond):
    """Count ``mfwuserfeed`` rows per bucket of ``floor(col * 10)``.

    Groups the rows matching ``cond`` by ``floor(col*10)``, builds a
    ``{bucket: count}`` dict and returns whatever ``self.calDictRate``
    produces from that dict.

    :param col: column name (or SQL expression) that is bucketed.
    :param cond: SQL text placed after ``where``.
    :return: the result of ``self.calDictRate`` applied to the counts
        (presumably a bucket -> rate mapping -- TODO confirm against
        ``calDictRate``).
    """
    # NOTE: ``col`` and ``cond`` are spliced into the SQL text verbatim, so
    # callers must only pass trusted, already-valid SQL fragments.
    bucket_expr = "floor(" + col + "*10)"
    sql = (
        "select " + bucket_expr
        + " as dense,count(*) as count from mfwuserfeed where "
        + cond
        + " group by " + bucket_expr + " "
    )
    rows = dbconn.query(sql)
    counts = {row.dense: row.count for row in rows}
    return self.calDictRate(counts)
def fileToNickname():
    """Insert a ``mfwuser`` row for every cached user that does not have one.

    Scans each sub-directory of ``tempDir`` for files whose name contains
    ``<userid>_<page>.html``. For every file whose page number is 1 and
    whose user id has no row in ``mfwuser`` yet, the nickname is obtained
    from ``getNickName(userid)`` and a new row is inserted.
    """
    # Raw string with an escaped dot: only a literal ".html" may follow the
    # page number (an unescaped "." would accept any character there).
    comp = re.compile(r"(\d+)_(\d+)\.html")
    for feedDir in os.listdir(tempDir):
        feedPath = tempDir + "/" + feedDir
        if not os.path.isdir(feedPath):
            continue
        for filename in os.listdir(feedPath):
            m = comp.search(filename)
            # Only the first page of each user is used to register the user.
            if not m or int(m.group(2)) != 1:
                continue
            userid = int(m.group(1))
            existing = dbconn.query(
                "select * from mfwuser where userid = $userid",
                vars=dict(userid=userid))
            if len(existing) == 0:
                nickname = getNickName(userid)
                dbconn.insert("mfwuser", userid=userid, nickname=nickname)
def calActSummary(sql):
    """Count in how many result rows each act name appears.

    Every row returned by ``sql`` must expose an ``actSummaryString``
    attribute. Its first character is dropped and the rest is split on
    ``$`` into ``act|count`` fragments (the first character is presumably
    a leading ``$`` -- TODO confirm against the code that writes it).
    Each fragment adds 1 to the tally of its act name; the ``count`` part
    of the fragment is not used.

    Empty or missing summary strings and empty fragments are skipped.
    A non-empty fragment without ``|`` is counted under its whole text.

    :param sql: SQL text passed to ``dbconn.query``.
    :return: the result of ``dictToOrderList`` applied to the
        ``{act: occurrences}`` dict.
    """
    summary = {}
    for r in dbconn.query(sql):
        actSummaryString = r.actSummaryString
        if not actSummaryString:
            # Nothing recorded for this row.
            continue
        for actString in actSummaryString[1:].split(u"$"):
            if not actString:
                # Produced by an empty summary or doubled separators.
                continue
            act = actString.split(u"|", 1)[0]
            summary[act] = summary.get(act, 0) + 1
    return dictToOrderList(summary)
def hasFeed(userid):
    """Return True when ``mfwuserfeed`` holds at least one row for ``userid``.

    :param userid: value bound to the ``$userid`` placeholder of the query.
    :return: ``True`` if the query returns one or more rows, else ``False``.
    """
    rows = dbconn.query(
        "select * from mfwuserfeed where userid = $userid",
        vars={"userid": userid})
    return len(rows) > 0
#coding:utf-8 from bs4 import BeautifulSoup import bs4,math,re,os from publicsettings import useridRange,dbconn,tempDir,articleidRange from datetime import datetime actypeDict = {} res = dbconn.query("select * from mfwactype") for r in res: actypeDict.update({r.name:r.id}) def hasMoreFeedPage(html): soup = BeautifulSoup(html, from_encoding="utf8") page = soup.find("div","f_turnpage") if not page: return False pages = page.get_text() if pages.find(u"末页") > -1: return True else: return False def getPagesAndCal(userid,lastpage): actDictList = [] for pagenumber in range(1,lastpage+1): html = open(tempDir + "/" + str(userid)[0:2] +"/" +str(userid) + "_" + str(pagenumber) + ".html").read() actDictList.extend( getFeed(html) ) if len(actDictList) == 0: dbconn.insert("mfwuserfeed", userid = userid,
def cal(className): if isinstance(className,list): tempName = "" for c in className: tempName = tempName + " " +c className = tempName if class_dict.has_key(className): class_dict[className] = class_dict[className] + 1 else: class_dict.update({className:1}) ##for mddDir in os.listdir(tempMddDir): ## if not os.path.isdir(tempMddDir +"/"+ mddDir): ## continue for filename in os.listdir(tempMddDir +"/1"): ana(open(tempMddDir+"/1/"+filename,"r").read()) f = open("d:/log/mmd.log","wb") for d in dictToOrderList(class_dict): f.write(d[0] + "," + str(d[1])+ "\r\n") """ res = dbconn.query("select distinct pid from mfwmdd where pid is not null") comp = re.compile(u"<title>(.+)地区旅游地图") for r in res: pid = r.pid html = open(tempMddDir+"/"+str(pid)[0]+"/"+str(pid)+".html","r").read() soup = BeautifulSoup(html,from_encoding="utf8") name = comp.search(unicode(soup.title)).group(1) dbconn.insert("mfwpid",pid=pid,name=name)
def start_requests(self):
    """Yield one request per distinct non-null ``pid`` stored in ``mfwmdd``.

    Each request is built by ``self.make_requests_from_url`` from the
    ``smap.php`` URL with the pid appended as the ``mddid`` query value.
    """
    url_prefix = "http://www.mafengwo.cn/mdd/smap.php?mddid="
    rows = dbconn.query("select distinct pid from mfwmdd where pid is not null")
    for row in rows:
        target = url_prefix + str(row.pid)
        yield self.make_requests_from_url(target)