def ReadDocProbVec(self, lDocNo, qid):
    """Load the topic-probability vector of every document of one query.

    The input file is ``self.DataDir + '<qid>_<self.DocProbNamePre>_DocTopicProb'``;
    each line is ``DocNo<TAB>serialized vector``.

    Args:
        lDocNo: document numbers; the result is aligned with this list.
        qid: query id used to build the file name.

    Returns:
        A list with one VectorC per entry of lDocNo. Documents that do not
        appear in the file keep an empty VectorC.

    Raises:
        ValueError: if the file names a document that is not in lDocNo
            (the same exception type ``list.index`` raised before), or if a
            line does not consist of exactly two tab-separated fields.
    """
    # One fresh vector per slot: ``[VectorC()] * n`` would make every slot
    # reference the same object, so mutating one default would mutate all.
    lDocProbVec = [VectorC() for _ in lDocNo]

    # First-occurrence position of each doc number, matching list.index().
    # assumes doc numbers are hashable (strings) -- TODO confirm
    hDocPos = {}
    for p, DocNo in enumerate(lDocNo):
        hDocPos.setdefault(DocNo, p)

    InName = self.DataDir + '%s_%s_DocTopicProb' % (qid, self.DocProbNamePre)
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(InName) as InFile:
        for line in InFile:
            DocNo, VecStr = line.strip().split('\t')
            Vector = VectorC()
            Vector.loads(VecStr)
            if DocNo not in hDocPos:
                raise ValueError('%r is not in list' % (DocNo,))
            lDocProbVec[hDocPos[DocNo]] = Vector
    return lDocProbVec
def Extract(self, qid, query, doc, Word2VecModel):
    """Extract pairwise query-term / doc-term embedding similarity features.

    For every field in ``self.lDocField`` and every combination of
    ``self.lSimMetric`` x ``self.lMergeMetric`` one feature is produced,
    named ``self.FeatureName + field + SimMetric + MergeMetric``.

    Args:
        qid: query id (forwarded to the base-class Extract).
        query: raw query string; terms are whitespace separated.
        doc: document object providing ``GetField(field)`` and ``DocNo``.
        Word2VecModel: mapping term -> embedding, supporting ``in``.

    Returns:
        dict mapping feature name to the merged similarity score.
    """
    EmbeddingFeatureExtractorC.Extract(self, qid, query, doc, Word2VecModel)
    hFeature = {}

    lQVec = self.FetchQTermEmbedding(query, Word2VecModel)
    QTermCnt = len(query.strip().split())
    if len(lQVec) != QTermCnt:
        logging.warning('query [%s] only [%d/%d] found in word2vec',
                        query, len(lQVec), QTermCnt)

    for field in self.lDocField:
        lTerm = doc.GetField(field).lower().split()
        lDVec = [
            VectorC(list(Word2VecModel[term]))
            for term in lTerm
            if term in Word2VecModel
        ]
        # The terms were lowercased above, so the out-of-vocabulary
        # placeholder has to be matched in lowercase too; comparing against
        # '[OOV]' could never succeed and inflated the expected term count.
        lNonStopTerm = [term for term in lTerm if term != '[oov]']
        if len(lDVec) != len(lNonStopTerm):
            logging.warning('doc [%s][%s] only [%d/%d] found in word2vec',
                            doc.DocNo, field, len(lDVec), len(lNonStopTerm))

        for SimMetric in self.lSimMetric:
            for MergeMetric in self.lMergeMetric:
                score = self.CalcPairWiseSim(lQVec, lDVec, SimMetric,
                                             MergeMetric)
                FeatureName = self.FeatureName + field + SimMetric + MergeMetric
                hFeature[FeatureName] = score
    return hFeature
def FetchDocVec(self, TargetNo, IsQid=False):
    """Fetch the vector of a document (or query) from the configured source.

    Args:
        TargetNo: document number, or a query id when IsQid is true.
        IsQid: only consulted for the 'gensim' source; when true the vector
            is built by ``self.GenerateGensimQVec(TargetNo)``.

    Returns:
        * 'text' source: the entry of ``self.hDocVec`` for TargetNo.
        * 'gensim' source: a VectorC built from the model entry of the
          internal id that ``self.hDocNoInternalId`` maps TargetNo to.
        * None when the target cannot be found or ``self.DocVecInType`` is
          not one of the two supported values (a message is logged).
    """
    if self.DocVecInType == 'text':
        if TargetNo not in self.hDocVec:
            logging.warning('TargetNo [%s] doc vec not found in text', TargetNo)
            return None
        return self.hDocVec[TargetNo]

    if self.DocVecInType == 'gensim':
        if IsQid:
            return self.GenerateGensimQVec(TargetNo)
        if TargetNo not in self.hDocNoInternalId:
            logging.warning(
                'Target No [%s] no in doc no to internal id mapping', TargetNo)
            return None
        # The model is keyed by internal ids, not by the external doc number.
        InternalId = self.hDocNoInternalId[TargetNo]
        if InternalId not in self.DocVecModel:
            logging.warning('target no [%s] not in doc vec', InternalId)
            return None
        return VectorC(list(self.DocVecModel[InternalId]))

    logging.error('doc vec in type [%s] not supportted', self.DocVecInType)
    return None
def FetchDocTermEmbedding(self, doc, Word2VecModel):
    """Return the embeddings of the document's content terms.

    The content from ``doc.GetContent()`` is lowercased and split on
    whitespace. Terms missing from Word2VecModel are skipped; order and
    repetitions of the remaining terms are kept.

    Returns:
        list of VectorC, one per term found in the model.
    """
    lVector = []
    for term in doc.GetContent().lower().split():
        if term in Word2VecModel:
            lVector.append(VectorC(list(Word2VecModel[term])))
    return lVector
def ExtractForOne(self, ExpTerm):
    """Score an expansion term by cosine to the mean query-term vector.

    ``self.hTargetTerm`` maps a term to its index in ``self.lVector``; an
    index of -1 marks a term with no vector.

    Args:
        ExpTerm: object with ``term`` (the expansion term) and ``query``
            (whitespace separated query string).

    Returns:
        ``{'word2vecsim': score}``. The score is 0 when the expansion term
        or all query terms have no vector.

    Raises:
        KeyError: if a term is absent from ``self.hTargetTerm``.
    """
    hFeature = {'word2vecsim': 0}

    TermIdx = self.hTargetTerm[ExpTerm.term]
    if TermIdx == -1:
        print("term [%s] not appear in word2vec" % (ExpTerm.term,))
        return hFeature
    TermVector = self.lVector[TermIdx]

    # Accumulate the vectors of the query terms that do have an embedding.
    QVector = VectorC()
    QTermCnt = 0
    for qterm in ExpTerm.query.split():
        QIdx = self.hTargetTerm[qterm]
        if QIdx == -1:
            print("qterm [%s] not appear in word2vec" % (qterm,))
            continue
        QTermCnt += 1
        QVector += self.lVector[QIdx]

    if QTermCnt == 0:
        return hFeature

    # Turn the sum into the average query vector before comparing.
    QVector /= float(QTermCnt)
    hFeature['word2vecsim'] = VectorC.cosine(QVector, TermVector)
    return hFeature
def FetchQTermEmbedding(self, query, Word2VecModel):
    """Return the embeddings of the query terms present in the model.

    The query is lowercased and split on whitespace; terms that are not in
    Word2VecModel are dropped, the rest keep their order.

    Returns:
        list of VectorC, one per query term found in the model.
    """
    return [
        VectorC(list(Word2VecModel[qt]))
        for qt in query.lower().split()
        if qt in Word2VecModel
    ]
def ProcessOneObj(self, ObjId, name):
    '''
    Score the search-result neighbors of one object.

    The cleaned name is run as a query through self.Searcher; each returned
    doc whose DocNo starts with '/m/' (and differs from ObjId) is compared
    with this object's description via VectorC.KL on term-frequency vectors.

    return lObjNeighbor = list of [neighbor id, -KL score], sorted by the
    negated KL score in descending order. Empty list when the cleaned name
    is empty or this object's description has no terms.
    NOTE(review): no truncation to self.NeighborNum happens in this method.
    '''
    query = TextBaseC.RawClean(name)
    if query == "":
        return []

    lObjDoc = self.Searcher.RunQuery(query)

    ThisLm = LmBaseC(self.ObjCenter.FetchObjDesp(ObjId))
    ThisVec = VectorC(ThisLm.hTermTF)
    if len(ThisLm.hTermTF) == 0:
        return []

    lObjNeighbor = []
    for ObjDoc in lObjDoc:
        NeighborId = ObjDoc.DocNo
        if NeighborId == ObjId:
            # the object itself is not its own neighbor
            continue
        if not NeighborId.startswith('/m/'):
            print("[%s %s] neighbor id [%s] format error" % (ObjId, name, NeighborId))
            continue

        NeighborVec = VectorC(LmBaseC(ObjDoc).hTermTF)
        if len(NeighborVec.hDim) == 0:
            continue

        # KL is negated so that sorting in descending order puts the
        # smallest divergence first.
        lObjNeighbor.append([NeighborId, -VectorC.KL(ThisVec, NeighborVec)])

    lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
    print("[%s:%s] neighbor id score get" % (ObjId, name))
    return lObjNeighbor
def GenerateEmbeddingFeatureVector(self, QVec, DocVec):
    """Build the feature vector describing the distance between two embeddings.

    ``self.DistanceType`` selects the distance:
        'abs' -> abs(QVec - DocVec)
        'raw' -> QVec - DocVec
        'l2'  -> Word2VecC.PointWiseL2(QVec, DocVec)
        'cos' -> a single dimension 0 holding Word2VecC.cosine(QVec, DocVec)
    Any other value yields an empty result.

    Every dimension key of the distance vector is shifted by the starting
    feature dimension: 1 when ``self.OverWrite`` is set, otherwise
    ``self.StFeatureDim``.

    Returns:
        VectorC whose hDim maps shifted key -> distance value.
    """
    ResVec = Word2VecC()
    StFeatureDim = 1 if self.OverWrite else self.StFeatureDim

    DistanceType = self.DistanceType
    if DistanceType == 'abs':
        ResVec = abs(QVec - DocVec)
    elif DistanceType == 'raw':
        ResVec = QVec - DocVec
    elif DistanceType == 'l2':
        ResVec = Word2VecC.PointWiseL2(QVec, DocVec)
    elif DistanceType == 'cos':
        ResVec.hDim[0] = Word2VecC.cosine(QVec, DocVec)

    FeatureVec = VectorC()
    for key, value in ResVec.hDim.items():
        FeatureVec.hDim[key + StFeatureDim] = value
    return FeatureVec
def TfIdfCosine(LmA, LmB, CtfCenter):
    """Return the cosine between the idf-weighted vectors of two language models.

    Args:
        LmA, LmB: language models exposing ``len`` and
            ``TransferToVectorWithIdf(CtfCenter)``.
        CtfCenter: collection statistics handed to TransferToVectorWithIdf.

    Returns:
        0 if either model is empty (``len == 0``), otherwise
        ``VectorC.cosine`` of the two transferred vectors. The score and
        both vectors are also printed.
    """
    # Logical ``or`` (not bitwise ``|``) states the intent and short-circuits.
    if LmA.len == 0 or LmB.len == 0:
        return 0
    vA = LmA.TransferToVectorWithIdf(CtfCenter)
    vB = LmB.TransferToVectorWithIdf(CtfCenter)
    score = VectorC.cosine(vA, vB)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("cosine [%f] of:\n%s\n%s" % (score, json.dumps(vA.hDim), json.dumps(vB.hDim)))
    return score
def GenerateGensimQVec(self, Qid):
    """Build a query vector as the mean of its terms' vectors in the model.

    The query text is looked up in ``self.hQidQuery``; its whitespace
    separated terms are lowercased and those present in
    ``self.DocVecModel`` are averaged.

    Args:
        Qid: query id, a key of ``self.hQidQuery``.

    Returns:
        VectorC of the averaged vector, or None when no query term is in
        the model.

    Raises:
        KeyError: if Qid is not in ``self.hQidQuery``.
    """
    query = self.hQidQuery[Qid]
    lQTerm = [term.lower() for term in query.split()]
    lQTerm = [term for term in lQTerm if term in self.DocVecModel]
    logging.info('calculating avg vec of q term [%s] for q [%s]',
                 json.dumps(lQTerm), query)

    lQArray = [self.DocVecModel[term] for term in lQTerm]
    if len(lQArray) == 0:
        return None

    # Use out-of-place arithmetic: ``+=`` / ``/=`` on lQArray[0] would modify
    # the array object handed out by the model and thereby corrupt the stored
    # vector of the first query term.
    MeanArray = lQArray[0]
    for QArray in lQArray[1:]:
        MeanArray = MeanArray + QArray
    MeanArray = MeanArray / float(len(lQArray))
    return VectorC(list(MeanArray))
def CalcPairWiseSim(self, lQVec, lDVec, SimMetric, MergeMetric):
    """Merge the similarities of all (query vector, doc vector) pairs.

    Args:
        lQVec: list of query term vectors.
        lDVec: list of document term vectors.
        SimMetric: metric name forwarded to ``VectorC.Similarity``.
        MergeMetric: 'min', 'max' or 'mean'.

    Returns:
        The merged score, or -1 when there is no pair to compare. For an
        unrecognized MergeMetric the score of the first pair is returned.
    """
    # Collect the scores first instead of using -1 as a "not set yet"
    # sentinel: a genuine score (or running sum) of -1 used to be mistaken
    # for "unset" and was overwritten by the next pair.
    lScore = [
        VectorC.Similarity(QVec, DVec, SimMetric)
        for QVec in lQVec
        for DVec in lDVec
    ]
    if not lScore:
        return -1

    if MergeMetric == 'min':
        return min(lScore)
    if MergeMetric == 'max':
        return max(lScore)
    if MergeMetric == 'mean':
        return sum(lScore) / float(len(lScore))
    return lScore[0]
def TfIdfCosine(LmA, LmB, CtfCenter):
    """Return the cosine between the tf-idf vectors of two language models.

    Each model's term frequencies (``hTermTF``) are divided by the model
    length (``len``) and multiplied by ``log(1 / CtfCenter.GetCtfProb(term))``.

    Args:
        LmA, LmB: language models exposing ``hTermTF`` and ``len``.
        CtfCenter: provides ``GetCtfProb(term)``.

    Returns:
        0 if either model is empty (``len == 0``), otherwise
        ``VectorC.cosine`` of the two weighted vectors. The score and both
        vectors are also printed.
    """
    # Check for empty models first so no vector is built just to be dropped;
    # logical ``or`` (not bitwise ``|``) states the intent.
    if LmA.len == 0 or LmB.len == 0:
        return 0

    # NOTE(review): if VectorC keeps a reference to the dict it is given,
    # the in-place scaling below also changes the models' hTermTF -- confirm.
    vA = VectorC(LmA.hTermTF)
    vB = VectorC(LmB.hTermTF)
    # float() avoids integer division under Python 2 when both the term
    # frequencies and the length are ints.
    vA /= float(LmA.len)
    vB /= float(LmB.len)

    for Vector in (vA, vB):
        for item in Vector.hDim:
            CTF = CtfCenter.GetCtfProb(item)
            Vector.hDim[item] *= math.log(1.0 / CTF)

    score = VectorC.cosine(vA, vB)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("cosine [%f] of:\n%s\n%s" % (score, json.dumps(vA.hDim), json.dumps(vB.hDim)))
    return score
def __init__(self, InData=None, word=""):
    """Initialize the vector, optionally from a serialized line.

    Args:
        InData: data forwarded to ``VectorC.__init__``; when it is a str it
            is additionally parsed with ``self.SetLine``. Defaults to a new
            empty dict.
        word: the word this vector belongs to.
    """
    # A ``{}`` default would be created once and shared by every instance
    # constructed without InData; create a fresh dict per call instead.
    if InData is None:
        InData = {}
    VectorC.__init__(self, InData)
    self.word = word
    if isinstance(InData, str):
        self.SetLine(InData)