def getTemplate(util_gt, running_time, perSlot=3, step_tp=24, Nh_tp=7, Nl_tp=7, retCache=None, basetime=None, userList=None):
    """Build per-event posting-count template vectors from cached posts.

    For every ground-truth event in util_gt, counts how many cached posts
    fall into each perSlot-hour slot measured from the event's base
    timestamp, producing a vector of int(Nl_tp * step_tp / perSlot) slots.

    Parameters:
        util_gt: iterable of objects exposing a ``db_event_id`` attribute.
        running_time: unix timestamp, used only to name the on-disk cache.
        perSlot: hours per template slot.
        step_tp, Nh_tp, Nl_tp: template sizing parameters (Nh_tp only
            affects the cache file name).
        retCache: when userList is empty, a flat iterable of post dicts with
            keys 'event_id' and 'dsttime'; otherwise a mapping
            user -> list of such post dicts.
        basetime: optional dict event_id -> base timestamp; events missing
            from it fall back to their earliest post time.
        userList: users whose cached posts to include. None/empty means
            "all users" and enables the pickle file cache.

    Returns:
        (template_vecs, basetime) when userList is empty, else just
        template_vecs (dict event_id -> list of per-slot counts).
    """
    # Fix the shared mutable-default-argument pitfall of the original
    # signature (userList=set()); semantics for callers are unchanged.
    if userList is None:
        userList = set()
    if len(userList) == 0:
        filename = '%s-Nh%s-lasts%s.template_all' % (helper.timestamp2str(running_time), Nh_tp, Nl_tp)
        if os.path.exists(filename):
            # Cached "all users" result. Binary mode is required for pickle;
            # the with-block closes the handle the original leaked.
            with open(filename, 'rb') as fin:
                return pickle.load(fin)
    template_vecs = {}
    outretEvent = {}
    if basetime is None:
        basetime = {}
    # Group the cached posts by event id.
    if len(userList) == 0:
        for line in retCache:
            outretEvent.setdefault(line['event_id'], []).append(line)
    else:
        for user in userList:
            for line in retCache.get(user, []):
                outretEvent.setdefault(line['event_id'], []).append(line)
    # Hoisted loop invariants: seconds per slot and the vector length.
    slotSeconds = 3600 * perSlot
    vecLen = int(Nl_tp * step_tp / perSlot)
    for event in util_gt:
        event_id = event.db_event_id
        if event_id not in outretEvent:
            # No cached posts for this event -> no template entry.
            continue
        eventContents = outretEvent[event_id]
        # basetime is always a dict here (see the None check above), so the
        # original's unreachable "basetime is None" duplicate branch is gone.
        if event_id in basetime:
            baseTimestamp = basetime[event_id]
        else:
            baseTimestamp = min(eventContents, key=lambda x: x['dsttime'])['dsttime']
        countDict = {}
        for every in eventContents:
            slot = int((every['dsttime'] - baseTimestamp) / slotSeconds)
            countDict[slot] = countDict.get(slot, 0) + 1
        # Slots past vecLen are deliberately dropped, matching the original.
        template_vecs[event_id] = [countDict.get(i, 0) for i in range(vecLen)]
    if len(userList) == 0:
        if saveTemplateToFileFlag:
            with open(filename, 'wb') as fout:
                pickle.dump((template_vecs, basetime), fout)
        return template_vecs, basetime
    else:
        return template_vecs
# --- init phase (script-level fragment) ------------------------------------
# NOTE(review): this fragment looks like a truncated duplicate of the init
# sequence that appears in full elsewhere in this file; its final db.getText
# call is cut off mid-statement. Reformatted for readability only.

# Bundle all prediction hyper-parameters into a single object.
PParam = pr.predictionParam(PNSh=PNSh, PNSd=PNSd, PNSdLong=PNSdLong, PperSlot=PperSlot, PsimTemplateThres=PsimTemplateThres, templateTopSimSize=PtemplateTopSimSize, Nh_tp=PNh_tp, Nl_tp=PNl_tp, PNShSim=PNShSim, PNstep=PNstep, PNstepLong=PNstepLong, combineCount=combineCount)
print '====== init %s %s ======' % (jobID, helper.timestamp2str(running_time))
# Ground-truth events plus the tf-idf similarity index over their titles.
dictionary, corpus_gt, tfidf_gt, index_gt, util_gt = ti.loadEventGT( dictionary, timePoint=running_time, step=step, gt_Nh=gt_Nh, gt_Nd=gt_Nd, jobID=jobID, inputEvent=set(), detectParam=DParam)
# Collect the ids of all ground-truth events.
eventlist_gt = set()
for event in util_gt:
    eventlist_gt.add(event.db_event_id)
# Fetch the initial corpus for those events.
# NOTE(review): statement truncated in SOURCE — remaining keyword arguments
# are missing here; see the complete call later in the file.
corpus_src_init, userInvolveList_init, corpus_init_ret = db.getText( timePoint=running_time,
def loadEventGT(dictionary, timePoint=1347724800, step = 1, gt_Nh=0, gt_Nd=0, event_id=0, jobID='tmpgt_', inputEvent=set(), detectParam=None, verbose=False):
    """Load ground-truth events and build a tf-idf similarity index over their titles.

    Parameters:
        dictionary: gensim-style dictionary, passed through corpus_init.
        timePoint: unix timestamp anchoring the history/detection window.
        step: slot length multiplier applied to gt_Nh / gt_Nd for the db query.
        gt_Nh, gt_Nd: history / detection window sizes, in slots.
        event_id: restrict the db query to one event (0 presumably means all
            -- TODO confirm against db.getEventInfo).
        jobID: prefix used in the on-disk cache file names.
        inputEvent: if non-empty, keep only events whose id is in this set.
            NOTE(review): mutable default argument; benign here because it is
            only read, never mutated.
        detectParam: optional object providing detectThres (defaults to 0).
        verbose: dump each GT document's tokens to stdout.

    Returns:
        (dictionary, corpus_gt, tfidf_gt, index_gt, util_gt) where util_gt is
        a list of gt_util_param records, one per kept event.
    """
    ret= db.getEventInfo(startTime=timePoint, historySlots=gt_Nh*step, detectionSlots=gt_Nd*step, event_id=event_id)
    corpus_src_gt=[]
    util_gt=[]
    # NOTE(review): event_gt is never populated or returned.
    event_gt=[]
    if ret:
        # print 'gt_event:'
        for event in ret:
            # Optional whitelist filter on event ids.
            if len(inputEvent)>0:
                if event['event_id'] not in inputEvent:
                    continue
            # Drop events that started before the history window
            # (gt_Nh days' worth of seconds before timePoint).
            if event['event_id_time']<(timePoint-gt_Nh*24*3600):
                continue
            # keyword=[word for word in event['split_words'].split(';') if word]
            # keyword=list(set(keyword))
            # gt.append(keyword)
            # print 'ID=%s: ABS=%s TIT=%s' %(event['event_id'], event['abstract'], event['title'])
            # Tokenize the event title and filter the segments; duplicates
            # are removed (set()), so token order is not significant.
            keywords= str(event['title'])
            wordseg = wc.para2seglist(keywords, tag=True)
            seglist = wc.seglist4filter(wordseg, srcTag = True, filterLow=False, fromFile=False)
            # print seglist
            seglist=list(set(seglist))
            # for element in seglist:
            #     print element,
            # print ''
            corpus_src_gt.append(seglist)
            # print event['event_id'],
            # for word in seglist:
            #     print word,
            # print ''
            #
            # [detectCount, previousErgency, db_event_id]
            if detectParam:
                detectThres= detectParam.detectThres
            else:
                detectThres=0
            # eventReportTime=event['time_get']
            # d = datetime.date(2015,1,5)
            # unixtime = time.mktime(d.timetuple())
            # One bookkeeping record per kept event (id, threshold, times).
            a_gt_util = gt_util_param(event['event_id'], detectThres, int(event['reportTime']), int(event['event_id_time']))
            util_gt.append(a_gt_util)
    print 'total gt size: %d' %len(corpus_src_gt)
    # Vectorize the GT titles and build the tf-idf similarity index; both
    # helpers cache their output under the jobID-derived prefixes.
    dictionary, corpus_gt, metas_gt = corpus_init(corpus_src_gt, dictionary, prefix='detect/%s_gt_%s_his%s_det%s_ev%s' %(jobID, helper.timestamp2str(timePoint), gt_Nh*step, gt_Nd*step, event_id), seg=False)
    tfidf_gt, index_gt = sims_init(corpus_gt, prefix='model/%s_gt_%s_his%sh_det%sh_ev%s' %(jobID, helper.timestamp2str(timePoint), gt_Nh * step, gt_Nd*step, event_id))
    # print 'dict size2=%s'%len(dictionary)
    if verbose:
        # Debug dump: event id followed by its tokens on one line.
        for i, corpus in enumerate(corpus_gt):
            print 'gtid=%s'%util_gt[i].db_event_id
            for word in corpus:
                print dictionary[word[0]],
            print ''
    return dictionary, corpus_gt, tfidf_gt, index_gt, util_gt
# --- script-level setup and init phase -------------------------------------
# Fresh gensim dictionary shared by the GT and detection corpora.
dictionary= corpora.Dictionary()
# Feature flags controlling which selection-algorithm variant runs (CLI).
runFlags= de.TypeFlag(args.selType)
# Parameters for the offline detector.
DParam=de.detectParam(isNewDocThres=offline_sim_thres, detectThres=offline_detectThres, userSimThres= offline_userSimThres, ergCoef=ergCoef, costCoef=costCoef, baCoef= baCoef)
if args.offtestFlag:
    # Separate parameter set used only for offline evaluation.
    offlineTestParam=de.detectParam(isNewDocThres=offline_sim_thres, recallDetectedCountThres=recallDetectedCountThres)
if args.ontestFlag:
    # Online-evaluation parameters (extra filter/combine thresholds).
    onlineTestParam=de.detectParamOnline(isNewDocThres=online_sim_thres, onlineFilterExistThres=online_filter_thres, onlineCombineThres=online_combine_thres, recallDetectedCountThres=recallDetectedCountThres,precDetectedCountThres=precDetectedCountThres)
# Prediction hyper-parameters bundled into one object.
PParam=pr.predictionParam(PNSh=PNSh, PNSd=PNSd, PNSdLong=PNSdLong, PperSlot=PperSlot, PsimTemplateThres=PsimTemplateThres, templateTopSimSize=PtemplateTopSimSize, Nh_tp=PNh_tp, Nl_tp=PNl_tp, PNShSim=PNShSim, PNstep=PNstep, PNstepLong=PNstepLong, combineCount=combineCount)
print '====== init %s %s ======' %(jobID, helper.timestamp2str(running_time))
# Ground-truth events plus the tf-idf similarity index over their titles.
dictionary, corpus_gt, tfidf_gt, index_gt, util_gt = ti.loadEventGT(dictionary, timePoint=running_time, step=step, gt_Nh=gt_Nh, gt_Nd=gt_Nd, jobID=jobID, inputEvent= set(), detectParam= DParam)
# Ids of all ground-truth events.
eventlist_gt=set()
for event in util_gt:
    eventlist_gt.add(event.db_event_id)
# Full initial corpus for those events; corpus_init_ret is the raw post
# cache reused by the template builder below.
corpus_src_init, userInvolveList_init, corpus_init_ret = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots = Nl * step, output_prefix='data/%s_init_%s_%sh%sh%sh_ev%s' %(jobID ,helper.timestamp2str(running_time), Nh*step, Nd*step, Nl*step, len(eventlist_gt)), event_id=eventlist_gt, util_gt=util_gt, cacheFlag=True)
# Per-event posting-count templates over the whole user population.
template_vecs_all, basetime_all_init = pr.getTemplate(util_gt, running_time, perSlot=PParam.PperSlot, step_tp=PParam.step_tp, Nh_tp=PParam.Nh_tp, Nl_tp=PParam.Nl_tp, retCache=corpus_init_ret)
# Middle slot (the per-step partial template) is filled in during selection.
PParam.template=(template_vecs_all, None, basetime_all_init)
delta_time=time.time() - program_start_time_init
print("--- init using %s seconds, or %s minutes ---" % (delta_time, delta_time/60.0))
program_start_time_select=time.time()
# alg. swc jnt
if args.selectFlag:
    print '====== selecting heu %s %s ====== ' %(jobID, helper.timestamp2str(running_time))
def selectUserHeu(c, k, userInvolveList, runFlags, dictionary, corpus_gt, tfidf_gt, index_gt, util_gt, running_time, step=1, Nh=0, Nl=0, jobID='tmp_user', eventlist=set(), selectSize=1, detectParam=None, predictParam=None):
    """Greedy heuristic user selection under a cost budget c and count budget k.

    Repeatedly calls selectUserOneStep to pick up to selectSize users per
    round, updating the remaining budgets and the per-event detection state
    between rounds, until a budget is exhausted, no candidates remain, or a
    round selects nobody.

    Returns:
        list of selected user ids, in selection order.
    """
    # print 'user before in init %d' %len(userInvolveList)
    # Per-user metadata (cost, follower counts, ...) keyed by uid.
    userInfo = db.getUserInfoAll(userInvolveList)
    userInfoDict={}
    for user in userInfo:
        userInfoDict[user['uid']]=user
    #sort by fo desc, and cost asce
    userInfo=sorted(userInfo, key= lambda x: (x['followers_count'], -x['cost']), reverse=True)
    cur_c=0
    cur_k=0
    # Posts from all candidate users over the history window; corpus_ret is
    # the raw cache handed to each scoring step.
    corpus_src_detect, userInv, corpus_ret = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots= Nl * step, output_prefix='data_sel/%s_heu_%s_his%sh_evall' %(jobID, helper.timestamp2str(running_time), Nh*step), userList=userInvolveList, event_id=eventlist, cacheFlag=True)
    # print 'user after in init=%d' %len(userInv)
    print 'corpus detect size=%d' %len(corpus_ret)
    # print 'onestep ', type(corpus_ret), len(corpus_ret)
    # for cor in corpus_ret:
    #     print type(cor),cor
    #
    selected_uid = []
    if runFlags.extFlag:
        # Extra copy used to accumulate detection counts across rounds.
        updated_util_gt_ext=copy.deepcopy(util_gt)
    updated_util_gt=copy.deepcopy(util_gt)
    blackUsers=set()
    PParam=predictParam
    for i in range(0,k):
        left_c = c- cur_c
        left_k = k- cur_k
        if runFlags.extFlag:
            # updated_util_gt=copy.deepcopy(updated_util_gt_ext)
            # Roll the per-event detection state forward between rounds.
            # NOTE(review): the inner loop variable shadows the outer
            # round index i; harmless today because the outer i is unused,
            # but fragile.
            for i, event in enumerate(updated_util_gt_ext):
                if updated_util_gt[i].detectCount >=1:
                    event.detectCount += 1#event.detectCount
                if event.detectCount>=event.detectThres:
                    updated_util_gt[i].detectThres=0
                    updated_util_gt[i].detectCount=0
                else:
                    updated_util_gt[i].detectThres=1
                    updated_util_gt[i].detectCount=0
        print 'selecting round for %d users, left_k=%s, left_c= %s' %(selectSize, left_k, left_c)
        if left_c <= 0 or left_k <= 0:
            print 'selected done!, cur_c=%s, cur_k=%s' %(cur_c, cur_k)
            return selected_uid
        else:
            # Build the candidate pool: active users not yet selected, and
            # (with bondFlag) whose cost fits under the per-round bound.
            userCand=set()
            bondCount=0
            for user in userInfo:
                if user['uid'] in blackUsers:
                    continue
                if user['uid'] in userInvolveList and user['uid'] not in selected_uid:
                    if user['avr_rp_count']>0:
                        if runFlags.bondFlag:
                            if user['cost'] > costBond(left_c, left_k, detectParam.costCoef) or user['cost']<=0:
                                bondCount+=1
                                continue
                        userCand.add(user['uid'])
            if len(userCand)>0:
                print 'selecting from all_user %d' %len(userCand)
                if runFlags.bondFlag:
                    print 'has bypassed %d bigger than bond' %bondCount
                if runFlags.preFlag:
                    # Prediction-aware variant returns an extra RMSE slot.
                    PParam.SelectedID=set(copy.deepcopy(selected_uid))
                    selSet, updated_util_gt, user_score, blackUsersStep, stepPredictionRMSES = selectUserOneStep(left_c, left_k, userCand, runFlags, userInfoDict, corpus_ret, dictionary, corpus_gt, tfidf_gt, index_gt, updated_util_gt, running_time, step=step, Nh=Nh, Nl=Nl, jobID=jobID, eventlist=eventlist, selectSize=selectSize, detectParam=detectParam, predictionParam=PParam)
                    if len(selSet)==0:
                        break
                    print 'selected user %s with score %f, RMSE %s' %(str(selSet), user_score, getRMSEMap(stepPredictionRMSES))
                else:
                    selSet, updated_util_gt, user_score, blackUsersStep = selectUserOneStep(left_c, left_k, userCand, runFlags, userInfoDict, corpus_ret, dictionary, corpus_gt, tfidf_gt, index_gt, updated_util_gt, running_time, step=step, Nh=Nh, Nl=Nl, jobID=jobID, eventlist=eventlist, selectSize=selectSize, detectParam=detectParam, predictionParam=PParam)
                    if len(selSet)==0:
                        break
                    print 'selected user %s with score %f' %(str(selSet), user_score)
                # Commit this round's picks and spend the budgets.
                for user in selSet:
                    selected_uid.append(user)
                    cur_c +=userInfoDict[user]['cost']
                cur_k += len(selSet)
                for user in blackUsersStep:
                    blackUsers.add(user)
            else:
                break
    print 'selected done!, cur_c=%d, cur_k=%s' %(cur_c, cur_k)
    return selected_uid
def selectUserOneStep(left_c, left_k, involveUsers, runFlags, userInfoDict, corpus_ret, dictionary, corpus_gt, tfidf_gt, index_gt, util_gt, running_time, step=1, Nh=0, Nl=0, jobID='tmp_user', eventlist=set(), selectSize=1, detectParam=None, predictionParam=None):
    """One greedy round: score every candidate user, then pick up to selectSize.

    Candidates are scored by how much their posts improve detection of the GT
    events -- inline when runFlags.extFlag is set, otherwise fanned out over a
    multiprocessing Pool in batches of split_size. With runFlags.preFlag, a
    prediction-RMSE term is folded into the ranking. Non-positively scored
    users (or, with preFlag, users whose RMSE exceeds phonyRMSE) are
    blacklisted for later rounds.

    Returns:
        (selID, step_util_gt, user_score, blackUsers) normally, or the
        5-tuple (..., PredictionRMSES) when runFlags.preFlag is set.
        NOTE(review): one early exit returns the 3-tuple (selID, None, None),
        which does not match either caller-visible arity -- latent bug.
    """
    # Baseline detection state used to measure each pick's marginal effect.
    base_util=copy.deepcopy(util_gt)
    if runFlags.preFlag:
        PParam=predictionParam
        # PselectID=set(PParam.SelectedID)
        PParam.predictCache=corpus_ret
        # Population-wide known/GT vectors shared by every worker.
        known_vecs_all, gt_vecs_all, basetime_all = pr.getKnownAndGTAll(corpus_ret, util_gt, perSlot=PParam.PperSlot, NSh= PParam.PNSh, NSd=PParam.PNSd)
    all_users=[]
    userScores=[]
    cnt = 0
    total_cnt = 0
    split_size=10000
    for userID in involveUsers:
        cnt += 1
        total_cnt += 1
        # Package everything a worker needs into a per-user record, since
        # Pool workers cannot share this function's locals.
        user = perUser()
        user.uid=userID
        user.running_time=running_time
        user.Nh=Nh
        # user.Nl=Nl
        user.step=step
        user.cost=userInfoDict[user.uid]['cost']
        # user.corpus_ret=corpus_ret
        user.util_gt=util_gt
        user.runFlags=runFlags
        user.detectParam=detectParam
        if not runFlags.docFlag:
            # Similarity-index path: the worker rebuilds a corpus and needs
            # the dictionary/tf-idf/index objects.
            user.jobID=jobID
            # user.eventlist=eventlist
            uset=set()
            uset.add(user.uid)
            user.corpus_src_detect, user.userInv = db.getText(timePoint=running_time, historySlots= Nh * step, timelastsSlots=Nl * step, output_prefix='data_sel/%s_user_%s_%s_his%sh_lasts%sh_evl0' %(jobID, user.uid, helper.timestamp2str(running_time), Nh*step, Nl*step), userList=uset, event_id=set(), verboseFlag=False,cacheRet=corpus_ret)
            user.dictionary=dictionary
            user.corpus_gt=corpus_gt
            user.tfidf_gt=tfidf_gt
            user.index_gt=index_gt
        else:
            # Document path: the worker only needs the user's raw posts.
            # user.jobID=jobID
            uset=set()
            uset.add(user.uid)
            user.userInv=uset
            user.corpus_src_detect=corpus_ret.get(user.uid,[])
        all_users.append(user)
        if runFlags.extFlag:
            # Inline scoring: wait until the full batch is assembled.
            if total_cnt< len(involveUsers):
                continue
            else:
                # ext_cnt=0
                # Shared progress counters (a `global` statement anywhere in
                # a function applies to the whole function body).
                global parCount,totalCount,leftCount
                parCount = Value('i', 0)
                totalCount = Value('i', len(involveUsers))
                leftCount = Value('i', left_k)
                for us in all_users:
                    ret=parSelect(us)
                    userScores.append(ret)
        else:
            # Parallel scoring in batches of split_size users.
            if cnt <split_size and total_cnt < len(involveUsers):
                continue
            else:
                cnt=0
                parCount = Value('i', len(userScores))
                totalCount = Value('i', len(involveUsers))
                leftCount = Value('i', left_k)
                workerThres=20
                workerCount=multiprocessing.cpu_count()
                if workerCount > workerThres:
                    workerCount = workerThres
                if runFlags.preFlag:
                    # Prediction workers also receive the shared vectors via
                    # the pool initializer.
                    pool = Pool(processes = workerCount, maxtasksperchild = 1600, initializer = parSelectInitP, initargs = (parCount, totalCount, leftCount, known_vecs_all, gt_vecs_all, basetime_all, PParam, ))
                    chunk, extra= divmod(len(all_users), 40)
                    if extra:
                        chunk+=1
                    it=pool.imap_unordered(parSelect, all_users, chunk)
                else:
                    pool = Pool(processes = workerCount, maxtasksperchild = 4800, initializer = parSelectInit, initargs = (parCount, totalCount, leftCount ))
                    chunk, extra= divmod(len(all_users), 8)
                    if extra:
                        chunk+=1
                    it=pool.imap_unordered(parSelect, all_users, chunk)#
                for ret in it:
                    userScores.append(ret)
                pool.close()
                pool.join()
                all_users=[]
                pool=None
    # Blacklist users with non-positive score, or (preFlag) phony RMSE.
    blackUsers=set()
    for user in userScores:
        if user[1]<=0:
            blackUsers.add(user[0])
        elif runFlags.preFlag:
            tempRMSE=numpy.mean(getRMSEMap(user[3]))
            if tempRMSE>phonyRMSE:
                blackUsers.add(user[0])
            # if numpy.isnan(tempRMSE):
            #     user[3][3]=phonyRMSE
    userScores=[user for user in userScores if user[0] not in blackUsers]
    # Ascending sort so .pop() below takes the best-ranked candidate.
    if runFlags.preFlag:
        userScoresSorted= sorted(userScores, key = lambda x:x[1]+detectParam.baCoef*numpy.mean(getRMSEMap(x[3])), reverse=False)
    else:
        userScoresSorted= sorted(userScores, key = lambda x:x[1], reverse=False)
    userScores=userScoresSorted
    selSet=[]
    selCost=0
    selID=set()
    selScore=0.0
    similarCount=0
    noSmallScoreCount=1
    predErrCount=0
    candidateUser=None
    # Greedily accept candidates while budgets allow; after each acceptance
    # the remaining candidates are re-scored relative to the new baseline.
    # Candidate tuples are (uid, score, updated_util_gt[, rmses]).
    while len(selSet)<min(selectSize, left_k) and len(userScores)>0 and selCost<left_c:
        candidateUser=userScores.pop()
        if runFlags.extFlag:
            if candidateUser[1]<=0:
                break
        if candidateUser[0] in selID:
            continue
        # Stop once marginal scores become negligible vs. the running mean.
        if len(selSet)>10:
            if candidateUser[1] < 0.0001 * float(selScore)/float(len(selSet)):
                break
        # Skip users too similar to those already picked this round.
        if userSim(candidateUser, selSet, thres=detectParam.userSimThres, countThres=math.ceil(selectSize * detectParam.userSimCountCoef)):
            similarCount+=1
            # print 'user %s similar, unselect' %user[0]
            continue
        selSet.append(candidateUser)
        selCost+=userInfoDict[candidateUser[0]]['cost']
        selID.add(candidateUser[0])
        selScore+=candidateUser[1]
        # Diagnostics: events newly detected (eCount) / over threshold (tCount).
        eCount=0
        tCount=0
        for i, aut in enumerate(candidateUser[2]):
            if aut.detectCount>base_util[i].detectCount:
                eCount+=1
            if aut.detectThres<=aut.detectCount:
                tCount+=1
        if runFlags.preFlag:
            print 'user %s selected, score %s, count %s, overcount %s, rmse %s' %(candidateUser[0], candidateUser[1], eCount, tCount, getRMSEMap(candidateUser[3]))
            sys.stdout.flush()
        else:
            print 'user %s selected, score %s, count %s, overcount %s' %(candidateUser[0], candidateUser[1], eCount, tCount)
            sys.stdout.flush()
        if len(selSet) >= selectSize:
            break
        # Re-score the remaining candidates against the accepted pick.
        newUserScores=[]
        for user in userScores:
            updated_gt, updated_score = updateScore(user[2], candidateUser[2], base_util, userInfoDict[user[0]]['cost'], runFlags, detectParam)
            if runFlags.preFlag:
                newUserScores.append((user[0], updated_score, updated_gt, user[3]))
            else:
                newUserScores.append((user[0], updated_score, updated_gt))
        if runFlags.preFlag:
            userScores= sorted(newUserScores, key = lambda x:x[1]+detectParam.baCoef*numpy.mean(getRMSEMap(x[3])), reverse=False)
        else:
            userScores= sorted(newUserScores, key = lambda x:x[1], reverse=False)
        base_util=copy.deepcopy(candidateUser[2])
    print 'this round selected %s users: consider selecting %s, bypass similar %s' %(len(selID), len(involveUsers)-len(blackUsers), similarCount)
    sys.stdout.flush()
    # Final utility of this round's combined selection.
    if not runFlags.docFlag:
        corpus_src_detect, userInv = db.getText(timePoint=running_time, historySlots= Nh * step, output_prefix='data_sel/%s_userstep%s-%s_%s_his%sh_evall' %(jobID, left_k, len(selID), helper.timestamp2str(running_time), Nh*step), userList=selID, event_id=set(), cacheRet=corpus_ret)
        if len(userInv)<1:
            # NOTE(review): 3-tuple return -- arity mismatch with callers.
            return selID, None, None
        dictionary, corpus_detect,metas = ti.corpus_init(corpus_src_detect, dictionary, prefix='model_sel/%s_userstep%s-%s_%s_his%sh_evall' %(jobID, left_k, len(selID), helper.timestamp2str(running_time), Nh*step), verbose=False, updateDict=False)
        step_util_gt, user_score = detectUtilWithGT(corpus_detect, dictionary, util_gt, corpus_gt, tfidf_gt, index_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=False)
    else:
        corpus_src_detect=[]
        for uID in selID:
            corpus_src_detect.extend(corpus_ret.get(uID,[]))
        if runFlags.extFlag:
            step_util_gt, user_score = detectUtilWithDoc(corpus_src_detect, util_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=True)
        else:
            step_util_gt, user_score = detectUtilWithDoc(corpus_src_detect, util_gt, runFlags, selCost, detectParam, countOnlyOnceFlag=False)
        # NOTE(review): overwrites the freshly computed step_util_gt with the
        # last accepted candidate's state -- presumably intentional, confirm.
        step_util_gt=candidateUser[2] #
    if runFlags.preFlag:
        # Recompute the selected-users-only vectors/templates and measure
        # prediction error for this round's selection.
        known_vecs_part_step, gt_vecs_part_step= pr.getKnownAndGTPart(util_gt=util_gt, running_time=running_time, selUsers=PParam.SelectedID.union(selID), perSlot=PParam.PperSlot, NSh=PParam.PNSh , NSd=PParam.PNSd, jobID='predict', inputCache=PParam.predictCache, basetime=basetime_all)
        template_part_step = pr.getTemplate(util_gt, running_time, perSlot=PParam.PperSlot, step_tp=PParam.step_tp, Nh_tp=PParam.Nh_tp, Nl_tp=PParam.Nl_tp, retCache=PParam.predictCache, userList=PParam.SelectedID.union(selID), basetime=PParam.template[2])
        rmses= pr.predictResult((PParam.template[0], template_part_step, PParam.template[2]), known_vecs_part_step, known_vecs_all, gt_vecs_part_step, gt_vecs_all, NSh=PParam.PNSh, NSd=PParam.PNSd, NShSim=PParam.PNShSim, Nstep=PParam.PNstep, topSimSize=PParam.templateTopSimSize, simThres=PParam.PsimTemplateThres, combineCount=PParam.combineCount)
        PredictionRMSES = rmses
        return selID, step_util_gt, user_score, blackUsers, PredictionRMSES
    else:
        return selID, step_util_gt, user_score, blackUsers
def parSelect(user):
    """Score one candidate user (runs inline or inside a Pool worker).

    Relies on module globals installed by the pool initializers
    (parSelectInit / parSelectInitP): shared progress counters, and -- for
    the preFlag path -- the population-wide known/GT vectors and prediction
    parameters.

    Returns:
        (uid, -1.0) when the user contributed no posts; otherwise
        (uid, score, tmp_util_gt) or, with preFlag,
        (uid, score, tmp_util_gt, currentPredictionRMSES).
    """
    global parCount,totalCount,leftCount
    if len(user.userInv)<1:
        # No posts for this user; callers treat score <= 0 as "blacklist".
        return user.uid, -1.0
    if not user.runFlags.docFlag:
        # Similarity-index path: vectorize the user's posts, then score them
        # against the GT tf-idf index.
        user.dictionary, corpus_detect, metas = ti.corpus_init(user.corpus_src_detect, user.dictionary, prefix='model_sel/%s_user_%s_%s_his%sh_evall' %(user.jobID, user.uid, helper.timestamp2str(user.running_time), user.Nh*user.step), verbose=False, outFlag=False, updateDict=False)
        tmp_util_gt, user_score = detectUtilWithGT(corpus_detect, user.dictionary, user.util_gt, user.corpus_gt, user.tfidf_gt, user.index_gt, user.runFlags, user.cost, user.detectParam)
    else:
        # Document path: score the raw posts directly.
        tmp_util_gt, user_score = detectUtilWithDoc(user.corpus_src_detect, user.util_gt, user.runFlags, user.cost, user.detectParam)
    # Progress is reported once every countPoint scored users.
    countPoint=200
    if user.runFlags.preFlag:
        #if nothing is detected, no need to calc preRMSE
        if user_score<=0:
            # countPoint=10
            # Placeholder RMSES so the caller's user[3] indexing still works.
            currentPredictionRMSES=[[phonyRMSE], [phonyRMSE], [phonyRMSE], [phonyRMSE]]
        else:
            # Shared state installed by parSelectInitP in each worker.
            global known_vecs_all_user,gt_vecs_all_user,basetime_all_user
            global PParam_user
            known_vecs_part_user, gt_vecs_part_user= pr.getKnownAndGTPart(util_gt=user.util_gt, running_time=user.running_time, selUsers=PParam_user.SelectedID.union(set([user.uid])), perSlot=PParam_user.PperSlot, NSh=PParam_user.PNSh , NSd=PParam_user.PNSd, jobID='predict', inputCache=PParam_user.predictCache, basetime=basetime_all_user)
            template_part_user = pr.getTemplate(user.util_gt, user.running_time, perSlot=PParam_user.PperSlot, step_tp=PParam_user.step_tp, Nh_tp=PParam_user.Nh_tp, Nl_tp=PParam_user.Nl_tp, retCache=PParam_user.predictCache, userList=PParam_user.SelectedID.union(set([user.uid])), basetime=PParam_user.template[2])
            rmses= pr.predictResult((PParam_user.template[0], template_part_user, PParam_user.template[2]), known_vecs_part_user, known_vecs_all_user, gt_vecs_part_user, gt_vecs_all_user, NSh=PParam_user.PNSh, NSd=PParam_user.PNSd, NShSim=PParam_user.PNShSim, Nstep=PParam_user.PNstep, topSimSize=PParam_user.templateTopSimSize, simThres=PParam_user.PsimTemplateThres, select=True, combineCount=PParam_user.combineCount)
            currentPredictionRMSES= rmses
    parCount.value +=1
    if(parCount.value % countPoint == 0):
        if user.runFlags.preFlag:
            print '%s/%s, left %s. user %s score %s rmse %s' %(parCount.value,totalCount.value,leftCount.value, user.uid,user_score, getRMSEMap(currentPredictionRMSES))
        else:
            print '%s/%s, left %s. user %s score %s' %(parCount.value,totalCount.value,leftCount.value, user.uid,user_score)
        sys.stdout.flush()
    if user.runFlags.preFlag:
        return user.uid, user_score, tmp_util_gt, currentPredictionRMSES
    else:
        return user.uid, user_score, tmp_util_gt
def loadEventGT(dictionary, timePoint=1347724800, step=1, gt_Nh=0, gt_Nd=0, event_id=0, jobID='tmpgt_', inputEvent=set(), detectParam=None, verbose=False): ret = db.getEventInfo(startTime=timePoint, historySlots=gt_Nh * step, detectionSlots=gt_Nd * step, event_id=event_id) corpus_src_gt = [] util_gt = [] event_gt = [] if ret: # print 'gt_event:' for event in ret: if len(inputEvent) > 0: if event['event_id'] not in inputEvent: continue if event['event_id_time'] < (timePoint - gt_Nh * 24 * 3600): continue # keyword=[word for word in event['split_words'].split(';') if word] # keyword=list(set(keyword)) # gt.append(keyword) # print 'ID=%s: ABS=%s TIT=%s' %(event['event_id'], event['abstract'], event['title']) keywords = str(event['title']) wordseg = wc.para2seglist(keywords, tag=True) seglist = wc.seglist4filter(wordseg, srcTag=True, filterLow=False, fromFile=False) # print seglist seglist = list(set(seglist)) # for element in seglist: # print element, # print '' corpus_src_gt.append(seglist) # print event['event_id'], # for word in seglist: # print word, # print '' # # [detectCount, previousErgency, db_event_id] if detectParam: detectThres = detectParam.detectThres else: detectThres = 0 # eventReportTime=event['time_get'] # d = datetime.date(2015,1,5) # unixtime = time.mktime(d.timetuple()) a_gt_util = gt_util_param(event['event_id'], detectThres, int(event['reportTime']), int(event['event_id_time'])) util_gt.append(a_gt_util) print 'total gt size: %d' % len(corpus_src_gt) dictionary, corpus_gt, metas_gt = corpus_init( corpus_src_gt, dictionary, prefix='detect/%s_gt_%s_his%s_det%s_ev%s' % (jobID, helper.timestamp2str(timePoint), gt_Nh * step, gt_Nd * step, event_id), seg=False) tfidf_gt, index_gt = sims_init(corpus_gt, prefix='model/%s_gt_%s_his%sh_det%sh_ev%s' % (jobID, helper.timestamp2str(timePoint), gt_Nh * step, gt_Nd * step, event_id)) # print 'dict size2=%s'%len(dictionary) if verbose: for i, corpus in enumerate(corpus_gt): print 'gtid=%s' % 
util_gt[i].db_event_id for word in corpus: print dictionary[word[0]], print '' return dictionary, corpus_gt, tfidf_gt, index_gt, util_gt
def getTemplate(util_gt, running_time, perSlot=3, step_tp=24, Nh_tp=7, Nl_tp=7, retCache=None, basetime=None, userList=None):
    """Build per-event posting-count template vectors from cached posts.

    For every ground-truth event in util_gt, counts how many cached posts
    fall into each perSlot-hour slot measured from the event's base
    timestamp, producing a vector of int(Nl_tp * step_tp / perSlot) slots.

    Parameters:
        util_gt: iterable of objects exposing a ``db_event_id`` attribute.
        running_time: unix timestamp, used only to name the on-disk cache.
        perSlot: hours per template slot.
        step_tp, Nh_tp, Nl_tp: template sizing parameters (Nh_tp only
            affects the cache file name).
        retCache: when userList is empty, a flat iterable of post dicts with
            keys 'event_id' and 'dsttime'; otherwise a mapping
            user -> list of such post dicts.
        basetime: optional dict event_id -> base timestamp; events missing
            from it fall back to their earliest post time.
        userList: users whose cached posts to include. None/empty means
            "all users" and enables the pickle file cache.

    Returns:
        (template_vecs, basetime) when userList is empty, else just
        template_vecs (dict event_id -> list of per-slot counts).
    """
    # Fix the shared mutable-default-argument pitfall of the original
    # signature (userList=set()); semantics for callers are unchanged.
    if userList is None:
        userList = set()
    if len(userList) == 0:
        filename = '%s-Nh%s-lasts%s.template_all' % (helper.timestamp2str(running_time), Nh_tp, Nl_tp)
        if os.path.exists(filename):
            # Cached "all users" result. Binary mode is required for pickle;
            # the with-block closes the handle the original leaked.
            with open(filename, 'rb') as fin:
                return pickle.load(fin)
    template_vecs = {}
    outretEvent = {}
    if basetime is None:
        basetime = {}
    # Group the cached posts by event id.
    if len(userList) == 0:
        for line in retCache:
            outretEvent.setdefault(line['event_id'], []).append(line)
    else:
        for user in userList:
            for line in retCache.get(user, []):
                outretEvent.setdefault(line['event_id'], []).append(line)
    # Hoisted loop invariants: seconds per slot and the vector length.
    slotSeconds = 3600 * perSlot
    vecLen = int(Nl_tp * step_tp / perSlot)
    for event in util_gt:
        event_id = event.db_event_id
        if event_id not in outretEvent:
            # No cached posts for this event -> no template entry.
            continue
        eventContents = outretEvent[event_id]
        # basetime is always a dict here (see the None check above), so the
        # original's unreachable "basetime is None" duplicate branch is gone.
        if event_id in basetime:
            baseTimestamp = basetime[event_id]
        else:
            baseTimestamp = min(eventContents, key=lambda x: x['dsttime'])['dsttime']
        countDict = {}
        for every in eventContents:
            slot = int((every['dsttime'] - baseTimestamp) / slotSeconds)
            countDict[slot] = countDict.get(slot, 0) + 1
        # Slots past vecLen are deliberately dropped, matching the original.
        template_vecs[event_id] = [countDict.get(i, 0) for i in range(vecLen)]
    if len(userList) == 0:
        if saveTemplateToFileFlag:
            with open(filename, 'wb') as fout:
                pickle.dump((template_vecs, basetime), fout)
        return template_vecs, basetime
    else:
        return template_vecs