def __init__(self, pageph, hitph):
    """Build every lookup structure the query engine needs.

    pageph -- path of the file read by initPageranker() (one float per line).
    hitph  -- path of the file read by inithits() (whitespace-separated
              fields, one record per line).
    """
    # Word segmenter used by wordsplit().
    self.ict = Ictclas('ICTCLAS50/')
    self.hitph, self.pageph = hitph, pageph

    # Per-document score accumulator.
    self.hitdoclist = Hitlist()
    # Word dictionary, used to translate a word into its wordID.
    self.wordbar = wordbar('../store/wordbar')

    # wordID -> hit-position index; it has to be initialised before use.
    hasher = InitHashWid('../store/sortedwidhits', '../store/hithash')
    self.hithasher = hasher
    hasher.initHashWid()

    # Per-document score totals.
    totals = InitRankTotal('../store/sorteddochits', '../store/tranks')
    self.ranktotal = totals
    totals.initTotalRank()

    # Both containers must exist before the loaders append to them.
    self.hits = []
    self.pageranker = []
    self.initPageranker()
    self.inithits()

    self.hithash = self.hithasher.hithash
    # Number of records in the hit table.
    self.length = len(self.hits)
    # Result sorter.
    self.sorter = sorter()
def __init__(self, pageph, hitph):
    """Set up the segmenter, dictionaries, indexes and tables used by queries.

    pageph -- path of the file read by initPageranker() (one float per line).
    hitph  -- path of the file read by inithits() (whitespace-separated
              fields, one record per line).
    """
    # Word segmenter used by wordsplit().
    self.ict = Ictclas('ICTCLAS50/')
    self.hitph, self.pageph = hitph, pageph

    # Score accumulator, one row per document.
    self.hitdoclist = Hitlist()
    # Word dictionary: resolves a word to its wordID.
    self.wordbar = wordbar('../store/wordbar')

    # Index from wordID to a position in the hit table.
    wid_index = InitHashWid('../store/sortedwidhits', '../store/hithash')
    self.hithasher = wid_index
    wid_index.initHashWid()

    # Sum of scores per single document.
    rank_totals = InitRankTotal('../store/sorteddochits', '../store/tranks')
    self.ranktotal = rank_totals
    rank_totals.initTotalRank()

    # The loaders below append into these lists, so create them first.
    self.hits = []
    self.pageranker = []
    self.initPageranker()
    self.inithits()

    self.hithash = self.hithasher.hithash
    # Length of the hit table.
    self.length = len(self.hits)
    # Sorter applied to the scored document list.
    self.sorter = sorter()
class Query:
    """Query library: scores documents against the words of a query string.

    The constructor loads the word dictionary, the wordID index, the
    per-document score totals, the PageRank values and the hit table;
    query() then accumulates, normalises and sorts document scores.
    """

    def __init__(self, pageph, hitph):
        """Load all lookup structures.

        pageph -- path of the PageRank file (one float per line).
        hitph  -- path of the hit table file (whitespace-separated fields).
        """
        self.ict = Ictclas('ICTCLAS50/')
        self.hitph = hitph
        self.pageph = pageph
        # Per-document score accumulator.
        self.hitdoclist = Hitlist()
        # Word dictionary, used to obtain a wordID for a word.
        self.wordbar = wordbar('../store/wordbar')
        # wordID index; must be initialised before it is searched.
        self.hithasher = InitHashWid('../store/sortedwidhits',
                                     '../store/hithash')
        self.hithasher.initHashWid()
        # Total score of every single document.
        self.ranktotal = InitRankTotal('../store/sorteddochits',
                                       '../store/tranks')
        self.ranktotal.initTotalRank()
        self.hits = []
        self.pageranker = []
        self.initPageranker()
        self.inithits()
        self.hithash = self.hithasher.hithash
        # Number of records in the hit table.
        self.length = len(self.hits)
        self.sorter = sorter()

    def initPageranker(self):
        """Append one float per line of the file at self.pageph to self.pageranker."""
        print('init pageranker')
        # 'with' closes the file even when a line fails to parse.
        with open(self.pageph) as f:
            for line in f:
                self.pageranker.append(float(line))

    def inithits(self):
        """Append the whitespace-split fields of each line of self.hitph to self.hits."""
        with open(self.hitph) as f:
            for line in f:
                self.hits.append(line.split())

    def _hit_span(self, hithashpos):
        """Return (start, end), the inclusive hit-table range of one hithash entry.

        The start is field 1 of the entry; the end is one less than the
        start of the following entry.  The last entry has no successor, so
        its range runs to the end of the hit table (the original indexed
        past the end of hithash in that case and raised IndexError).
        """
        starthitpos = int(self.hithash[hithashpos][1])
        if starthitpos + 1 < self.length:
            nextpos = hithashpos + 1
            if nextpos < len(self.hithash):
                endhitpos = int(self.hithash[nextpos][1]) - 1
            else:
                endhitpos = self.length - 1
        else:
            endhitpos = starthitpos
        return starthitpos, endhitpos

    def query(self, strr):
        """Run a single query: score, normalise and sort the matching documents.

        strr -- the raw query string; it is segmented with wordsplit().

        Nothing is returned; the scored rows stay in self.hitdoclist and the
        sorter prints the result.  getResList() yields the docid string.
        """
        words = self.wordsplit(strr)
        for word in words.split():
            print('%s %s' % ('--start to query word--', word))
            wordid = self.wordbar.find(word)
            print('%s %s' % ('查得的wordID为', wordid))
            # NOTE(review): truthiness tests also skip a legitimate id or
            # position of 0 -- confirm what the two find() calls return.
            if not wordid:
                continue
            # find() returns the position of the entry inside hithash.
            hithashpos = self.hithasher.find([wordid, 0])
            if not hithashpos:
                continue
            starthitpos, endhitpos = self._hit_span(hithashpos)
            print('%s %s' % ('查得的hitpos为', starthitpos))
            print('%s %s' % ('开始地址', starthitpos))
            print('%s %s' % ('结束地址', endhitpos))
            # Feed every hit of this word's segment to the accumulator.
            index = starthitpos
            while index <= endhitpos:
                self.hitdoclist.find(self.hits[index])
                index += 1
            # NOTE(review): the source indentation was lost; the reset is
            # assumed to happen once per word -- confirm.
            print('对结尾进行还原')
            self.hitdoclist.InitStatus()

        print('the former doclist---------------------------')
        for i in self.hitdoclist:
            print(i)

        # Convert the absolute scores into scores relative to the document totals.
        for i, score in enumerate(self.hitdoclist):
            getcontext().prec = 6
            docid = score[0]
            # Position of this document's record in the totals table.
            rankpos = self.ranktotal.find([docid, 0])
            for j, total in enumerate(self.ranktotal.tranks[rankpos]):
                if j == 0:
                    # Column 0 holds the docid, not a score.
                    continue
                if int(total) == 0:
                    self.hitdoclist[i][j] = 0
                else:
                    self.hitdoclist[i][j] = float(score[j]) / float(total)

            print('now calculate the pageranker with the result')
            print('%s %s' % ('the docid is', self.hitdoclist[i][0]))
            # A weighted formula (pageranker[docid] multiplied by columns
            # 1..7 weighted 0.5, 0.056, 0.167, 0.11, 0.11, 0.027, 0.027) was
            # disabled in the original; a plain sum is used instead.
            # NOTE(review): column 1 is counted twice and the loop also adds
            # the running total (last column) to itself -- kept as is
            # because changing it would alter the ranking; confirm intent.
            self.hitdoclist[i][-1] = self.hitdoclist[i][1]
            for k, summ in enumerate(self.hitdoclist[i]):
                if k > 0:
                    self.hitdoclist[i][-1] += summ

        print('start to print the former hitdoclist')
        for i in self.hitdoclist:
            print(i)
        self.sorter.run(self.hitdoclist)
        print('the result')
        self.sorter.showlist()
        # The original had 'return self.getResList()' disabled here (result
        # string for the server); the method still returns None.

    def wordsplit(self, sentence):
        """Segment the query sentence into words with the ICTCLAS segmenter."""
        return self.ict.split(sentence)

    def getResList(self):
        """Return the docid (column 0) of every row, each followed by one space."""
        return ''.join(str(row[0]) + ' ' for row in self.hitdoclist)
class Query:
    """Query library: scores documents against the words of a query string.

    The constructor loads the word dictionary, the wordID index, the
    per-document score totals, the PageRank values and the hit table;
    query() then accumulates, normalises and sorts document scores.
    """

    def __init__(self, pageph, hitph):
        """Load all lookup structures.

        pageph -- path of the PageRank file (one float per line).
        hitph  -- path of the hit table file (whitespace-separated fields).
        """
        self.ict = Ictclas('ICTCLAS50/')
        self.hitph = hitph
        self.pageph = pageph
        # Per-document score accumulator.
        self.hitdoclist = Hitlist()
        # Word dictionary, used to obtain a wordID for a word.
        self.wordbar = wordbar('../store/wordbar')
        # wordID index; must be initialised before it is searched.
        self.hithasher = InitHashWid('../store/sortedwidhits',
                                     '../store/hithash')
        self.hithasher.initHashWid()
        # Total score of every single document.
        self.ranktotal = InitRankTotal('../store/sorteddochits',
                                       '../store/tranks')
        self.ranktotal.initTotalRank()
        self.hits = []
        self.pageranker = []
        self.initPageranker()
        self.inithits()
        self.hithash = self.hithasher.hithash
        # Number of records in the hit table.
        self.length = len(self.hits)
        self.sorter = sorter()

    def initPageranker(self):
        """Append one float per line of the file at self.pageph to self.pageranker."""
        print('init pageranker')
        # 'with' closes the file even when a line fails to parse.
        with open(self.pageph) as f:
            for line in f:
                self.pageranker.append(float(line))

    def inithits(self):
        """Append the whitespace-split fields of each line of self.hitph to self.hits."""
        with open(self.hitph) as f:
            for line in f:
                self.hits.append(line.split())

    def _hit_span(self, hithashpos):
        """Return (start, end), the inclusive hit-table range of one hithash entry.

        The start is field 1 of the entry; the end is one less than the
        start of the following entry.  The last entry has no successor, so
        its range runs to the end of the hit table (the original indexed
        past the end of hithash in that case and raised IndexError).
        """
        starthitpos = int(self.hithash[hithashpos][1])
        if starthitpos + 1 < self.length:
            nextpos = hithashpos + 1
            if nextpos < len(self.hithash):
                endhitpos = int(self.hithash[nextpos][1]) - 1
            else:
                endhitpos = self.length - 1
        else:
            endhitpos = starthitpos
        return starthitpos, endhitpos

    def query(self, strr):
        """Run a single query: score, normalise and sort the matching documents.

        strr -- the raw query string; it is segmented with wordsplit().

        Nothing is returned; the scored rows stay in self.hitdoclist and the
        sorter prints the result.  getResList() yields the docid string.
        """
        words = self.wordsplit(strr)
        for word in words.split():
            print('%s %s' % ('--start to query word--', word))
            wordid = self.wordbar.find(word)
            print('%s %s' % ('查得的wordID为', wordid))
            # NOTE(review): truthiness tests also skip a legitimate id or
            # position of 0 -- confirm what the two find() calls return.
            if not wordid:
                continue
            # find() returns the position of the entry inside hithash.
            hithashpos = self.hithasher.find([wordid, 0])
            if not hithashpos:
                continue
            starthitpos, endhitpos = self._hit_span(hithashpos)
            print('%s %s' % ('查得的hitpos为', starthitpos))
            print('%s %s' % ('开始地址', starthitpos))
            print('%s %s' % ('结束地址', endhitpos))
            # Feed every hit of this word's segment to the accumulator.
            index = starthitpos
            while index <= endhitpos:
                self.hitdoclist.find(self.hits[index])
                index += 1
            # NOTE(review): the source indentation was lost; the reset is
            # assumed to happen once per word -- confirm.
            print('对结尾进行还原')
            self.hitdoclist.InitStatus()

        print('the former doclist---------------------------')
        for i in self.hitdoclist:
            print(i)

        # Convert the absolute scores into scores relative to the document totals.
        for i, score in enumerate(self.hitdoclist):
            getcontext().prec = 6
            docid = score[0]
            # Position of this document's record in the totals table.
            rankpos = self.ranktotal.find([docid, 0])
            for j, total in enumerate(self.ranktotal.tranks[rankpos]):
                if j == 0:
                    # Column 0 holds the docid, not a score.
                    continue
                if int(total) == 0:
                    self.hitdoclist[i][j] = 0
                else:
                    self.hitdoclist[i][j] = float(score[j]) / float(total)

            print('now calculate the pageranker with the result')
            print('%s %s' % ('the docid is', self.hitdoclist[i][0]))
            # A weighted formula (pageranker[docid] multiplied by columns
            # 1..7 weighted 0.5, 0.056, 0.167, 0.11, 0.11, 0.027, 0.027) was
            # disabled in the original; a plain sum is used instead.
            # NOTE(review): column 1 is counted twice and the loop also adds
            # the running total (last column) to itself -- kept as is
            # because changing it would alter the ranking; confirm intent.
            self.hitdoclist[i][-1] = self.hitdoclist[i][1]
            for k, summ in enumerate(self.hitdoclist[i]):
                if k > 0:
                    self.hitdoclist[i][-1] += summ

        print('start to print the former hitdoclist')
        for i in self.hitdoclist:
            print(i)
        self.sorter.run(self.hitdoclist)
        print('the result')
        self.sorter.showlist()
        # The original had 'return self.getResList()' disabled here (result
        # string for the server); the method still returns None.

    def wordsplit(self, sentence):
        """Segment the query sentence into words with the ICTCLAS segmenter."""
        return self.ict.split(sentence)

    def getResList(self):
        """Return the docid (column 0) of every row, each followed by one space."""
        return ''.join(str(row[0]) + ' ' for row in self.hitdoclist)