def showLastDays(word, days):
    """Plot how many posts per day contain ``word`` over the last ``days`` days.

    The window ends at the latest date returned by ``getTimeDomain`` and starts
    ``days`` days before it.  Each post returned by
    ``getPostDatebyTimeDomain`` whose content (``post[0]``) contains ``word``
    is counted in the bucket of the calendar day it was posted on
    (``post[2]``, formatted ``"%Y-%m-%d %H:%M"``).  The per-day counts are
    passed to ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in each post's content.
        days: Length of the window in days.

    Returns:
        None.  The chart is drawn and a completion message is printed.
    """
    # Only the latest date is used; the start of the window is derived from it.
    _, enddate = getTimeDomain(RFF.getDateList())
    begdate = enddate - datetime.timedelta(days=days)
    spostdate = getPostDatebyTimeDomain(begdate, enddate,
                                        RFF.getPostDataList())

    # Start counting word frequency: one bucket per calendar day, from the day
    # of begdate through the day of enddate inclusive (days + 1 buckets).
    first_day = begdate.date()
    one_day = datetime.timedelta(days=1)
    feqlist = []
    timeline = []
    x = 0
    xdate = first_day
    while x <= days:
        feqlist.append(0)
        timeline.append(str(xdate.month) + "-" + str(xdate.day))
        xdate += one_day
        x += 1

    # spostdate: [[content, author, time], [...], ...]
    for post in spostdate:
        if word not in post[0]:
            continue
        posted = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
        # Index by calendar-day offset so the count lands under the label of
        # the day the post was written.  (Indexing with "offset - 1" put
        # first-day posts at index -1, i.e. in the last bucket.)
        satpos = (posted.date() - first_day).days
        # Ignore anything outside the plotted window instead of letting a
        # negative index wrap around to the end of the list.
        if 0 <= satpos < len(feqlist):
            feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline, feqlist,
        '时间频率图(' + str(begdate.date()) + "->" + str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastYears(word, years):
    """Plot how many posts per year contain ``word`` over the last ``years`` years.

    The window ends at the latest date returned by ``getTimeDomain`` and starts
    ``years * 365`` days before it.  Each post returned by
    ``getPostDatebyTimeDomain`` whose content (``post[0]``) contains ``word``
    is counted in the bucket of the calendar year it was posted in
    (``post[2]``, formatted ``"%Y-%m-%d %H:%M"``).

    Args:
        word: Substring looked for in each post's content.
        years: Number of years to look back.

    Returns:
        None.  The chart is drawn and progress/debug lines are printed.
    """
    # Only the latest date is used; the start of the window is derived from it.
    _, enddate = getTimeDomain(RFF.getDateList())
    begdate = enddate - datetime.timedelta(days=years * 365)
    spostdate = getPostDatebyTimeDomain(begdate, enddate,
                                        RFF.getPostDataList())

    # Start counting word frequency: bucket i holds calendar year
    # begdate.year + i.
    feqlist = []
    timeline = []
    print("begdate=", begdate, "enddate=", enddate)
    x = 0
    while x <= years:
        # Label from the year number itself.  Stepping a date by 365 days
        # drifts across leap years and can produce the same year twice, which
        # would put counts under the wrong label.
        bucket_year = begdate.year + x
        feqlist.append(0)
        timeline.append(str(bucket_year) + "年")
        print(str(bucket_year))
        x += 1

    # spostdate: [[content, author, time], [...], ...]
    for post in spostdate:
        if post[0].find(word) > -1:
            postdate = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
            satpos = postdate.year - begdate.year
            print("satpos=", satpos, "\tpostdate=", postdate, "\tbegdate=",
                  begdate, "\tyear1=", postdate.year, "\tyear2=",
                  begdate.year)
            # Skip out-of-window years rather than wrapping around (negative
            # index) or raising.
            if 0 <= satpos < len(feqlist):
                feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline, feqlist,
        '时间频率图(' + str(begdate.year) + "->" + str(enddate.year) + ")")
    print('>>>>>图像加载完毕')
def activeTimeAnaylize(authorname, days):
    """Plot at which hours of the day ``authorname`` posts.

    Posts are fetched with ``RFF.queryContainListAfterTime`` starting at the
    beginning of the window, grouped per day by ``sortandget`` and
    ``gatherbyDays``, tallied per hour for each day, and the per-day tallies
    are summed per hour before being plotted.

    Args:
        authorname: Author passed to ``RFF.queryContainListAfterTime`` and
            shown in the chart title.
        days: Window length in days, counted back from the latest time in the
            data source.  A value <= 0 selects everything from the earliest
            time in the data source.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("获取数据集中的最近时间...")
    enddate = RFF.queryDatasourceLatestTime()
    print("计算时间区间...")
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    else:
        begdate = RFF.queryDatasourceEarlyTime()
    print('时间区间:', begdate, '->', enddate)
    print("获取回帖列表...")
    spostdate = RFF.queryContainListAfterTime(authorname, str(begdate))
    print("开始统计.")

    # Per the original note, the grouped data looks like
    # [[date, [timestamps...]], ...]; only element [1] of each entry is read
    # here, and each of its items must expose ``.hour``.
    tpostdata = gatherbyDays(sortandget(spostdate))

    # Analyse the active hours: one 24-slot tally per day.
    hours = range(24)
    xvalue = list(hours)
    FEQLIST = []
    for day_entry in tpostdata:
        feqlist = [0] * 24
        for stamp in day_entry[1]:
            feqlist[stamp.hour] += 1
        FEQLIST.append(feqlist)
        print(str(feqlist))
    del tpostdata

    # Despite its name this holds the per-hour TOTAL over all days, not a
    # mean; that matches the '发帖次数' axis label and the message below.
    avgfeq = [sum(day[hour] for day in FEQLIST) for hour in hours]
    print("after add up all :\n\n", str(avgfeq))
    drawGraphic.linePlotGraphics(
        '时间(小时)', '发帖次数', xvalue, avgfeq,
        "【" + authorname + '】的活跃时间段图(共 ' + str(len(FEQLIST)) +
        " 天数据)")
def showLastDays(authorname,days):
    """Plot how many posts ``authorname`` made per day over the last ``days`` days.

    Data comes from the task result file (``RFF.openResult``).  Posts inside
    the window are selected with ``getPostDatebyTimeDomain``, narrowed to the
    author with ``getPostByAuthor`` and counted per day with
    ``getCountByDate``.

    Args:
        authorname: Author whose posts are counted; shown in the chart title.
        days: Window length in days, counted back from the latest date in the
            data.  A value <= 0 selects the whole date range of the data.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("加载任务结果文件...")
    buf = RFF.openResult()
    datebuf = RFF.getDateList(buf)
    begdate, enddate = getTimeDomain(datebuf)
    del datebuf
    print("计算时间区间...")
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    else:
        # Whole data set: keep the earliest date reported by getTimeDomain
        # (it used to be overwritten unconditionally) and derive the number
        # of days to plot, otherwise the loops below would produce nothing.
        days = (enddate.date() - begdate.date()).days
    print("解析回帖数据...")
    buf = RFF.getPostDataList(buf)
    spostdate = getPostDatebyTimeDomain(begdate, enddate, buf)
    del buf
    print("开始统计.")
    spostdate = getPostByAuthor(authorname, spostdate)
    llen = len(spostdate)

    # Start counting.
    # NOTE(review): in both loops the count is taken for the day AFTER the one
    # whose label was just appended (xdate is advanced before getCountByDate
    # is called).  Left as is because getCountByDate's matching rule is not
    # visible here - confirm whether labels and counts line up.
    feqlist = []
    timeline = []
    x = 0
    xdate = begdate
    one_day = datetime.timedelta(days=1)
    if days > 30:
        # Number of unlabeled x positions inserted after each labeled one, so
        # that long ranges do not crowd the axis.
        ommit_xlabel_per = days / 30 - 1
        while x <= days:
            timeline.append(str(xdate.month) + "-" + str(xdate.day))
            xdate += one_day
            feqlist.append(getCountByDate(xdate, spostdate))
            x += 1
            ppp = 0
            while ppp < ommit_xlabel_per and x <= days:
                timeline.append("")
                xdate += one_day
                feqlist.append(getCountByDate(xdate, spostdate))
                x += 1
                ppp += 1
        # Always show the full date on the last position, which may otherwise
        # be blank.  This was written with "==" and therefore did nothing.
        xdate -= one_day
        timeline[-1] = str(xdate.date())
    else:
        while x < days:
            timeline.append(str(xdate.month) + "-" + str(xdate.day))
            xdate += one_day
            feqlist.append(getCountByDate(xdate, spostdate))
            x += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(llen) + ')', timeline, feqlist,
        "【" + authorname + '】的活跃程度图(' + str(begdate.date()) + "->" +
        str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(word, days):
    """Plot how many posts per day contain ``word`` over the last ``days`` days.

    Data comes from the task result file (``RFF.openResult``).  The window
    ends at the latest date returned by ``getTimeDomain`` and starts ``days``
    days before it.  Each post returned by ``getPostDatebyTimeDomain`` whose
    content (``post[0]``) contains ``word`` is counted in the bucket of the
    calendar day it was posted on (``post[2]``, formatted
    ``"%Y-%m-%d %H:%M"``).

    Args:
        word: Substring looked for in each post's content; shown in the title.
        days: Length of the window in days.  Above 30 days only some x
            positions are labeled.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("加载任务结果文件...")
    buf = RFF.openResult()
    datebuf = RFF.getDateList(buf)
    # Only the latest date is used; the start of the window is derived below.
    _, enddate = getTimeDomain(datebuf)
    del datebuf
    print("计算时间区间...")
    begdate = enddate - datetime.timedelta(days=days)
    print("解析回帖数据...")
    buf = RFF.getPostDataList(buf)
    spostdate = getPostDatebyTimeDomain(begdate, enddate, buf)
    del buf
    print("开始统计.")

    # Start counting word frequency: one bucket per calendar day, from the day
    # of begdate through the day of enddate inclusive (days + 1 buckets).
    first_day = begdate.date()
    one_day = datetime.timedelta(days=1)
    many_days = days > 30
    # Number of unlabeled x positions inserted after each labeled one, so that
    # long ranges do not crowd the axis; none for ranges of 30 days or less.
    ommit_xlabel_per = days / 30 - 1 if many_days else 0
    feqlist = []
    timeline = []
    x = 0
    xdate = first_day
    while x <= days:
        feqlist.append(0)
        timeline.append(str(xdate.month) + "-" + str(xdate.day))
        xdate += one_day
        x += 1
        ppp = 0
        while ppp < ommit_xlabel_per and x <= days:
            feqlist.append(0)
            timeline.append("")
            xdate += one_day
            x += 1
            ppp += 1
    if many_days:
        # Always show the full date on the last position, which may otherwise
        # be blank.  This was written with "==" and therefore did nothing.
        timeline[-1] = str(xdate - one_day)

    # spostdate: [[content, author, time], [...], ...]
    for post in spostdate:
        if word not in post[0]:
            continue
        posted = datetime.datetime.strptime(post[2], "%Y-%m-%d %H:%M")
        # Index by calendar-day offset so the count lands under the label of
        # the day the post was written.  (Indexing with "offset - 1" put
        # first-day posts at index -1, i.e. in the last bucket.)
        satpos = (posted.date() - first_day).days
        # Ignore anything outside the plotted window instead of letting a
        # negative index wrap around to the end of the list.
        if 0 <= satpos < len(feqlist):
            feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline, feqlist,
        "【" + word + '】的时间频率图(' + str(begdate.date()) + "->" +
        str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(authorname, days):
    """Plot how many posts ``authorname`` made per day over the last ``days`` days.

    Data comes from the data source behind ``RFF``.  Posts are fetched with
    ``RFF.queryContainListAfterTime`` starting at the beginning of the window
    and counted per day with ``getCountByDate``.

    Args:
        authorname: Author passed to the query; shown in the chart title.
        days: Window length in days, counted back from the latest time in the
            data source.  A value <= 0 selects everything from the earliest
            time in the data source.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("获取数据集中的最近时间...")
    enddate = RFF.queryDatasourceLatestTime()
    print("计算时间区间...")
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    else:
        begdate = RFF.queryDatasourceEarlyTime()
        # Whole data set: derive the number of days to plot, otherwise the
        # loops below would produce an empty chart.
        days = (enddate.date() - begdate.date()).days
    print('时间区间:', begdate, '->', enddate)
    print("获取回帖列表...")
    spostdate = RFF.queryContainListAfterTime(authorname, str(begdate))
    llen = len(spostdate)
    print("开始统计.")

    # Start counting.
    # NOTE(review): in both loops the count is taken for the day AFTER the one
    # whose label was just appended (xdate is advanced before getCountByDate
    # is called).  Left as is because getCountByDate's matching rule is not
    # visible here - confirm whether labels and counts line up.
    feqlist = []
    timeline = []
    x = 0
    xdate = begdate
    one_day = datetime.timedelta(days=1)
    if days > 30:
        # Number of unlabeled x positions inserted after each labeled one, so
        # that long ranges do not crowd the axis.
        ommit_xlabel_per = days / 30 - 1
        while x <= days:
            timeline.append(str(xdate.month) + "-" + str(xdate.day))
            xdate += one_day
            feqlist.append(getCountByDate(xdate, spostdate))
            x += 1
            ppp = 0
            while ppp < ommit_xlabel_per and x <= days:
                timeline.append("")
                xdate += one_day
                feqlist.append(getCountByDate(xdate, spostdate))
                x += 1
                ppp += 1
        # Always show the full date on the last position, which may otherwise
        # be blank.  This was written with "==" and therefore did nothing.
        xdate -= one_day
        timeline[-1] = str(xdate.date())
    else:
        while x < days:
            timeline.append(str(xdate.month) + "-" + str(xdate.day))
            xdate += one_day
            feqlist.append(getCountByDate(xdate, spostdate))
            x += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(llen) + ')', timeline, feqlist,
        "【" + authorname + '】的活跃程度图(' + str(begdate.date()) + "->" +
        str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def showLastDays(word,days):
    """Plot how many posts per day contain ``word`` over the last ``days`` days.

    Data comes from the data source behind ``RFF``.  The window ends at
    ``RFF.queryDatasourceLatestTime()`` and starts ``days`` days before it.
    Each row returned by ``RFF.queryWordContainListAfterTime`` whose content
    (``post[3]``) contains ``word`` is counted in the bucket of the calendar
    day it was posted on (``post[4]``, a datetime).

    Args:
        word: Substring looked for in each post's content; shown in the title.
        days: Length of the window in days.  Above 30 days only some x
            positions are labeled.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("获取数据集中的最近时间...")
    enddate = RFF.queryDatasourceLatestTime()
    print("计算时间区间...")
    begdate = enddate - datetime.timedelta(days=days)
    print('时间区间:', begdate, '->', enddate)
    print("获取回帖列表...")
    spostdate = RFF.queryWordContainListAfterTime(str(begdate))
    print("解析回帖数据...")

    # Start counting word frequency: one bucket per calendar day, from the day
    # of begdate through the day of enddate inclusive (days + 1 buckets).
    first_day = begdate.date()
    one_day = datetime.timedelta(days=1)
    many_days = days > 30
    # Number of unlabeled x positions inserted after each labeled one, so that
    # long ranges do not crowd the axis; none for ranges of 30 days or less.
    ommit_xlabel_per = days / 30 - 1 if many_days else 0
    feqlist = []
    timeline = []
    x = 0
    xdate = first_day
    while x <= days:
        feqlist.append(0)
        timeline.append(str(xdate.month) + "-" + str(xdate.day))
        xdate += one_day
        x += 1
        ppp = 0
        while ppp < ommit_xlabel_per and x <= days:
            feqlist.append(0)
            timeline.append("")
            xdate += one_day
            x += 1
            ppp += 1
    if many_days:
        # Always show the full date on the last position, which may otherwise
        # be blank.  This was written with "==" and therefore did nothing.
        timeline[-1] = str(xdate - one_day)

    # Row layout: [thread link, forum name, author, content, post time,
    #              replied-to user, page]
    for post in spostdate:
        if word not in post[3]:
            continue
        # Index by calendar-day offset so the count lands under the label of
        # the day the post was written.  (Indexing with "offset - 1" put
        # first-day posts at index -1, i.e. in the last bucket.)
        satpos = (post[4].date() - first_day).days
        # Ignore anything outside the plotted window instead of letting a
        # negative index wrap around to the end of the list.
        if 0 <= satpos < len(feqlist):
            feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline, feqlist,
        "【" + word + '】的时间频率图(' + str(begdate.date()) + "->" +
        str(enddate.date()) + ")")
    print('>>>>>图像加载完毕')
def singleWordTF(word, datalist, scale=30):
    """Plot how often ``word`` occurs over time, in buckets of ``scale`` days.

    The time range comes from ``getTimeDomain(RFF.getDateList())``.  Thread
    titles and replies that contain ``word`` are counted in the bucket their
    timestamp falls into, and the counts are passed to
    ``drawGraphic.linePlotGraphics``.

    Args:
        word: Substring looked for in titles and reply contents.
        datalist: Threads shaped as
            ``[[[title, author, time], [[reply, author, time], ...]], ...]``
            with times formatted ``"%Y-%m-%d %H:%M"``.
        scale: Bucket width in days.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    # Parse the timeline to get the earliest and latest time.
    begtime, endtime = getTimeDomain(RFF.getDateList())
    print("对比日期范围:", begtime, "->", endtime)
    blocks = int((endtime - begtime).days / scale)

    # Initialise the frequency array: buckets 0..blocks, each labeled with the
    # date it starts on.
    print('>>>>>开始处理.....')
    feqlist = []
    timeline = []
    step = datetime.timedelta(days=scale)
    xdate = begtime
    x = 0
    while x <= blocks:
        feqlist.append(0)
        timeline.append(str(xdate.date()))
        xdate += step
        x += 1

    def _tally(timestamp):
        """Add one hit to the bucket that contains ``timestamp``."""
        posted = datetime.datetime.strptime(timestamp, "%Y-%m-%d %H:%M")
        feqpos = int((posted - begtime).days / scale)
        # Ignore timestamps outside the plotted range instead of wrapping
        # around (negative index) or raising.
        if 0 <= feqpos < len(feqlist):
            feqlist[feqpos] += 1

    for post in datalist:
        if post[0][0].find(word) > -1:
            _tally(post[0][2])
        # Replies are checked for every thread, whether or not its title
        # matched.
        for reply in post[1]:
            # Skip malformed replies that lack content/author/time.
            if len(reply) < 3:
                continue
            if reply[0].find(word) > -1:
                _tally(reply[2])

    print('>>>>>处理完成,加载图像中.....')
    print(str(feqlist))
    print(str(timeline), str(feqlist))

    # Total shown in the axis label: every thread plus all of its replies.
    # (The previous expression multiplied the thread count by the length of
    # the first title and failed on an empty datalist.)
    total_posts = sum(1 + len(post[1]) for post in datalist)

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(total_posts) + ')',
        timeline, feqlist,
        '时间频率图(' + str(begtime) + "->" + str(endtime) + ")")
    print('>>>>>图像加载完毕')
def activeTimeAnaylize(authorname,days):
    """Plot at which hours of the day ``authorname`` posts.

    Data comes from the task result file (``RFF.openResult``).  Posts inside
    the window are selected with ``getPostDatebyTimeDomain``, narrowed to the
    author with ``getPostByAuthor``, grouped per day by ``sortandget`` and
    ``gatherbyDays``, tallied per hour for each day, and the per-day tallies
    are summed per hour before being plotted.

    Args:
        authorname: Author whose posts are analysed; shown in the chart title.
        days: Window length in days, counted back from the latest date in the
            data.  A value <= 0 keeps the whole range reported by
            ``getTimeDomain``.

    Returns:
        None.  The chart is drawn and the tallies are printed.
    """
    buf = RFF.openResult()
    begdate, enddate = getTimeDomain(RFF.getDateList(buf))
    buf = RFF.getPostDataList(buf)
    if days > 0:
        begdate = enddate - datetime.timedelta(days=days)
    in_window = getPostDatebyTimeDomain(begdate, enddate, buf)
    del buf
    by_author = getPostByAuthor(authorname, in_window)
    del in_window

    # Per the original notes the author's posts look like
    # [[content, time], ...] and the grouped data like
    # [[date, [timestamps...]], ...]; only element [1] of each grouped entry
    # is read here, and each of its items must expose ``.hour``.
    grouped = gatherbyDays(sortandget(by_author))

    # Analyse the active hours: one 24-slot tally per day.
    hours = range(24)
    xvalue = list(hours)
    FEQLIST = []
    for entry in grouped:
        per_hour = [0 for _ in hours]
        for stamp in entry[1]:
            per_hour[stamp.hour] += 1
        FEQLIST.append(per_hour)
        print(str(per_hour))
    del grouped

    # Per-hour totals over all days (a sum, not a mean, despite the name).
    avgfeq = [sum(day[slot] for day in FEQLIST) for slot in hours]
    print("after add up all :\n\n", str(avgfeq))

    title = ("【" + authorname + '】的活跃时间段图(共 ' + str(len(FEQLIST)) +
             " 天数据)")
    drawGraphic.linePlotGraphics('时间(小时)', '发帖次数', xvalue, avgfeq, title)
def showLastYears(word,years):
    """Plot how many posts per year contain ``word`` over the last ``years`` years.

    Data comes from the data source behind ``RFF``.  The window ends at
    ``RFF.queryDatasourceLatestTime()`` and starts ``years * 365`` days before
    it.  Each row returned by ``RFF.queryWordContainListAfterTime`` whose
    content (``post[3]``) contains ``word`` is counted in the bucket of the
    calendar year it was posted in (``post[4]``).

    Args:
        word: Substring looked for in each post's content; shown in the title.
        years: Number of years to look back.

    Returns:
        None.  The chart is drawn and progress lines are printed.
    """
    print("获取数据集中的最近时间...")
    enddate = RFF.queryDatasourceLatestTime()
    print("计算时间区间...")
    begdate = enddate - datetime.timedelta(days=years * 365)
    print('时间区间:', begdate, '->', enddate)
    print("获取回帖列表...")
    spostdate = RFF.queryWordContainListAfterTime(str(begdate))
    print("解析回帖数据...")

    # Start counting word frequency: bucket i holds calendar year
    # begdate.year + i.
    feqlist = []
    timeline = []
    print("begdate=", begdate, "enddate=", enddate)
    x = 0
    while x <= years:
        # Label from the year number itself.  Stepping a date by 365 days
        # drifts across leap years and can produce the same year twice, which
        # would put counts under the wrong label.
        bucket_year = begdate.year + x
        feqlist.append(0)
        timeline.append(str(bucket_year) + "年")
        print(str(bucket_year))
        x += 1

    # Row layout: [thread link, forum name, author, content, post time,
    #              replied-to user, page]
    for post in spostdate:
        if post[3].find(word) > -1:
            satpos = post[4].year - begdate.year
            # Skip out-of-window years rather than wrapping around (negative
            # index) or raising.
            if 0 <= satpos < len(feqlist):
                feqlist[satpos] += 1

    # Draw the chart.
    drawGraphic.linePlotGraphics(
        '时间', '出现次数(帖子/回帖总数:' + str(len(spostdate)) + ')',
        timeline, feqlist,
        "【" + word + '】的时间频率图(' + str(begdate.year) + "->" +
        str(enddate.year) + ")")
    print('>>>>>图像加载完毕')


# Helper function used to find the time interval.
# Returns: earliest time, latest time