def ReadDocProbVec(self,lDocNo,qid):
    """Load the per-document vectors of a query, aligned with lDocNo.

    Reads the file ``<DataDir><qid>_<DocProbNamePre>_DocTopicProb``; each
    line is ``DocNo<TAB>serialized vector``, where the vector text is
    parsed by ``VectorC.loads``.

    Args:
        lDocNo: list of document numbers; fixes the order of the result.
        qid: query id used to build the input file name.

    Returns:
        A list with one VectorC per entry of lDocNo. Documents that do not
        appear in the file keep a fresh, empty VectorC().

    Raises:
        ValueError: if the file mentions a DocNo that is not in lDocNo, or
            a line does not split into exactly two tab-separated fields.
    """
    # One distinct placeholder per slot: `[VectorC()] * n` would put the
    # *same* instance in every slot, so mutating one would change them all.
    lDocProbVec = [VectorC() for _ in lDocNo]

    # Position of the first occurrence of each doc number (the same answer
    # list.index() gives), built once instead of scanning per input line.
    hDocPos = {}
    for Pos, DocNo in enumerate(lDocNo):
        hDocPos.setdefault(DocNo, Pos)

    InName = self.DataDir + '%s_%s_DocTopicProb' % (qid, self.DocProbNamePre)
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(InName) as In:
        for line in In:
            DocNo, VecStr = line.strip().split('\t')
            if DocNo not in hDocPos:
                # Same exception type list.index() raised before.
                raise ValueError('%r is not in list' % (DocNo,))
            Vector = VectorC()
            Vector.loads(VecStr)
            lDocProbVec[hDocPos[DocNo]] = Vector
    return lDocProbVec
Пример #2
0
    def Extract(self, qid, query, doc, Word2VecModel):
        """Compute query/document embedding similarity features per field.

        For every field in ``self.lDocField`` the document terms found in
        ``Word2VecModel`` are turned into vectors and compared with the
        query term vectors, once for each combination of
        ``self.lSimMetric`` and ``self.lMergeMetric``.

        Args:
            qid: query id (only forwarded to the base class call).
            query: raw query text, whitespace separated.
            doc: document object providing ``GetField(field)`` and ``DocNo``.
            Word2VecModel: mapping-like model supporting ``term in model``
                and ``model[term]``.

        Returns:
            dict mapping ``self.FeatureName + field + SimMetric +
            MergeMetric`` to the score returned by ``CalcPairWiseSim``.
        """
        EmbeddingFeatureExtractorC.Extract(self, qid, query, doc,
                                           Word2VecModel)
        hFeature = {}
        lQVec = self.FetchQTermEmbedding(query, Word2VecModel)
        QTermCnt = len(query.strip().split())
        if len(lQVec) != QTermCnt:
            logging.warning('query [%s] only [%d/%d] found in word2vec',
                            query, len(lQVec), QTermCnt)

        for field in self.lDocField:
            lTerm = doc.GetField(field).lower().split()
            lDVec = [
                VectorC(list(Word2VecModel[term])) for term in lTerm
                if term in Word2VecModel
            ]
            # The terms were lower-cased above, so the OOV marker has to be
            # matched in lower case; comparing with '[OOV]' never matched.
            NonOovCnt = sum(1 for term in lTerm if term != '[oov]')
            if len(lDVec) != NonOovCnt:
                logging.warning(
                    'doc [%s][%s] only [%d/%d] found in word2vec',
                    doc.DocNo, field, len(lDVec), NonOovCnt)
            for SimMetric in self.lSimMetric:
                for MergeMetric in self.lMergeMetric:
                    score = self.CalcPairWiseSim(lQVec, lDVec, SimMetric,
                                                 MergeMetric)
                    FeatureName = self.FeatureName + field + SimMetric + MergeMetric
                    hFeature[FeatureName] = score

        return hFeature
    def FetchDocVec(self, TargetNo, IsQid=False):
        """Look up the vector of a document (or query) by its identifier.

        The lookup depends on ``self.DocVecInType``:

        * ``'text'``: return ``self.hDocVec[TargetNo]``.
        * ``'gensim'``: for a query id (``IsQid``) delegate to
          ``GenerateGensimQVec``; otherwise map TargetNo through
          ``self.hDocNoInternalId`` and read ``self.DocVecModel``.

        Args:
            TargetNo: document number, or query id when IsQid is true.
            IsQid: whether TargetNo is a query id (only used for 'gensim').

        Returns:
            The stored vector, a VectorC built from the model array, or
            None when the target is unknown or the input type is not
            supported (a message is logged in those cases).
        """
        if self.DocVecInType == 'text':
            if TargetNo not in self.hDocVec:
                logging.warning('TargetNo [%s] doc vec not found in text',
                                TargetNo)
                return None
            return self.hDocVec[TargetNo]

        if self.DocVecInType == 'gensim':
            if IsQid:
                return self.GenerateGensimQVec(TargetNo)

            if TargetNo not in self.hDocNoInternalId:
                logging.warning(
                    'Target No [%s] no in doc no to internal id mapping',
                    TargetNo)
                return None
            # The model is keyed by internal ids, not by doc numbers.
            InternalId = self.hDocNoInternalId[TargetNo]
            if InternalId not in self.DocVecModel:
                logging.warning('target no [%s] not in doc vec', InternalId)
                return None
            return VectorC(list(self.DocVecModel[InternalId]))

        logging.error('doc vec in type [%s] not supportted', self.DocVecInType)
        return None
 def FetchDocTermEmbedding(self, doc, Word2VecModel):
     """Return one VectorC per document term that the model knows.

     The document content (``doc.GetContent()``) is lower-cased and split
     on whitespace; terms missing from ``Word2VecModel`` are skipped, so
     the result may be shorter than the document.
     """
     lEmbedding = []
     for Token in doc.GetContent().lower().split():
         if Token in Word2VecModel:
             lEmbedding.append(VectorC(list(Word2VecModel[Token])))
     return lEmbedding
 def ExtractForOne(self,ExpTerm):
     """Score an expansion term against the average query term vector.

     ``self.hTargetTerm`` maps a term to its position in ``self.lVector``,
     with -1 marking terms that have no vector.

     Args:
         ExpTerm: object exposing ``term`` (the expansion term) and
             ``query`` (whitespace separated query text).

     Returns:
         dict with the single key ``'word2vecsim'``: the cosine between
         the mean vector of the query terms that have a vector and the
         expansion term vector, or 0 when the expansion term or every
         query term lacks a vector.
     """
     hFeature = {'word2vecsim': 0}

     TermIdx = self.hTargetTerm[ExpTerm.term]
     if TermIdx == -1:
         # Parenthesised single argument: prints the same on Python 2 and 3.
         print("term [%s] not appear in word2vec" %(ExpTerm.term))
         return hFeature
     TermVector = self.lVector[TermIdx]

     # Sum the vectors of the query terms that have one.
     QVector = VectorC()
     QTermCnt = 0
     for qterm in ExpTerm.query.split():
         QTermIdx = self.hTargetTerm[qterm]
         if QTermIdx == -1:
             print("qterm [%s] not appear in word2vec" %(qterm))
             continue
         QTermCnt += 1
         QVector += self.lVector[QTermIdx]
     if QTermCnt == 0:
         return hFeature
     QVector /= float(QTermCnt)

     hFeature['word2vecsim'] = VectorC.cosine(QVector,TermVector)
     return hFeature
    def FetchQTermEmbedding(self, query, Word2VecModel):
        """Return one VectorC per query term that the model knows.

        The query is lower-cased and split on whitespace; terms missing
        from ``Word2VecModel`` are silently dropped.
        """
        return [
            VectorC(list(Word2VecModel[Token]))
            for Token in query.lower().split()
            if Token in Word2VecModel
        ]
Пример #7
0
    def ProcessOneObj(self, ObjId, name):
        '''
        Find the neighbors of an object by searching for its name.

        The cleaned name is run as a query through self.Searcher; every
        returned object other than ObjId itself is scored with
        VectorC.KL between the term-frequency vector of this object's
        description and that of the neighbor.

        return lObjNeighbor = [[objid, -KL score], ...] sorted by the
        negated score, largest first. The list is empty when the cleaned
        name or the description's term vector is empty.
        NOTE(review): the list is not truncated here; any cut to
        self.NeighborNum presumably happens in the caller -- confirm.
        '''

        query = TextBaseC.RawClean(name)
        if query == "":
            return []
        lObjDoc = self.Searcher.RunQuery(query)

        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        if len(ThisLm.hTermTF) == 0:
            return []
        ThisVec = VectorC(ThisLm.hTermTF)

        lObjNeighbor = []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                continue
            if not Id.startswith('/m/'):
                # Parenthesised single argument: same output on Python 2 and 3.
                print("[%s %s] neighbor id [%s] format error" % (ObjId, name,
                                                                 Id))
                continue
            # NOTE(review): the LM is built from the doc object here but from
            # description text above -- confirm LmBaseC accepts both.
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            # Negate so that a larger value means a smaller KL score.
            lObjNeighbor.append([Id, -score])

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print("[%s:%s] neighbor id score get" % (ObjId, name))
        return lObjNeighbor
    def GenerateEmbeddingFeatureVector(self, QVec, DocVec):
        """Turn a query/document vector pair into an offset feature vector.

        ``self.DistanceType`` selects the comparison: ``'abs'`` and
        ``'raw'`` use the (absolute) difference, ``'l2'`` uses
        ``Word2VecC.PointWiseL2`` and ``'cos'`` stores the cosine in
        dimension 0. Any other value leaves the result empty.

        Every dimension key is shifted by 1 when ``self.OverWrite`` is
        set, otherwise by ``self.StFeatureDim``.
        """
        Kind = self.DistanceType
        DiffVec = Word2VecC()
        if Kind == 'abs':
            DiffVec = abs(QVec - DocVec)
        elif Kind == 'raw':
            DiffVec = QVec - DocVec
        elif Kind == 'l2':
            DiffVec = Word2VecC.PointWiseL2(QVec, DocVec)
        elif Kind == 'cos':
            DiffVec.hDim[0] = Word2VecC.cosine(QVec, DocVec)

        Offset = 1 if self.OverWrite else self.StFeatureDim

        FeatureVec = VectorC()
        for Dim, Weight in DiffVec.hDim.items():
            FeatureVec.hDim[Dim + Offset] = Weight
        return FeatureVec
Пример #9
0
 def TfIdfCosine(LmA,LmB,CtfCenter):
     """Cosine of the idf-weighted vectors of two language models.

     Args:
         LmA, LmB: language models exposing ``len`` and
             ``TransferToVectorWithIdf(CtfCenter)``.
         CtfCenter: term statistics passed through to the transfer call.

     Returns:
         0 when either model has ``len == 0``, otherwise the value of
         ``VectorC.cosine`` for the two transferred vectors.
     """
     # Logical `or` (short-circuiting) instead of the bitwise `|`.
     if LmA.len == 0 or LmB.len == 0:
         return 0

     vA = LmA.TransferToVectorWithIdf(CtfCenter)
     vB = LmB.TransferToVectorWithIdf(CtfCenter)

     score = VectorC.cosine(vA, vB)

     # Parenthesised single argument: same output on Python 2 and 3.
     print("cosine [%f] of:\n%s\n%s" %(score, json.dumps(vA.hDim),json.dumps(vB.hDim)))
     return score
    def GenerateGensimQVec(self, Qid):
        """Build a query vector as the mean of its term vectors.

        The query text is read from ``self.hQidQuery[Qid]``; its
        lower-cased terms that exist in ``self.DocVecModel`` are averaged.

        Args:
            Qid: query id, a key of ``self.hQidQuery``.

        Returns:
            VectorC built from the mean array, or None when no query term
            is in the model.

        Raises:
            KeyError: if Qid is not in ``self.hQidQuery``.
        """
        query = self.hQidQuery[Qid]
        lQTerm = [term.lower() for term in query.split()]
        lQTerm = [term for term in lQTerm if term in self.DocVecModel]
        logging.info('calculating avg vec of q term [%s] for q [%s]',
                     json.dumps(lQTerm), query)
        lQArray = [self.DocVecModel[term] for term in lQTerm]
        if len(lQArray) == 0:
            return None
        # Use `a = a + b` / `a = a / n` rather than `+=` / `/=`: the
        # in-place forms would modify lQArray[0], i.e. the array handed out
        # by the model, and so corrupt the stored vector of the first term.
        # (assumes the model returns arrays with element-wise + and /,
        # e.g. numpy -- confirm)
        MeanArray = lQArray[0]
        for QArray in lQArray[1:]:
            MeanArray = MeanArray + QArray
        MeanArray = MeanArray / float(len(lQArray))
        return VectorC(list(MeanArray))
Пример #11
0
    def CalcPairWiseSim(self, lQVec, lDVec, SimMetric, MergeMetric):
        """Merge the similarities of all query/document vector pairs.

        Args:
            lQVec: query term vectors.
            lDVec: document term vectors.
            SimMetric: metric name forwarded to ``VectorC.Similarity``.
            MergeMetric: ``'min'``, ``'max'`` or ``'mean'``.

        Returns:
            The merged score over every (QVec, DVec) pair; -1 when there
            is no pair at all. For an unrecognised MergeMetric the score
            of the first pair is returned.
        """
        # Collect the scores first instead of using -1 as an "unset" marker:
        # a real similarity (or running value) of -1 used to be mistaken for
        # "nothing seen yet" and silently reset the aggregate.
        lScore = [
            VectorC.Similarity(QVec, DVec, SimMetric)
            for QVec in lQVec
            for DVec in lDVec
        ]
        if not lScore:
            return -1
        if MergeMetric == 'min':
            return min(lScore)
        if MergeMetric == 'max':
            return max(lScore)
        if MergeMetric == 'mean':
            return sum(lScore) / float(len(lScore))
        return lScore[0]
Пример #12
0
 def TfIdfCosine(LmA,LmB,CtfCenter):
     """Cosine of two language models after tf * log(1/ctf) weighting.

     Each term frequency is divided by the model length and multiplied
     by ``log(1 / CtfCenter.GetCtfProb(term))``.

     Args:
         LmA, LmB: language models exposing ``hTermTF`` and ``len``.
         CtfCenter: object providing ``GetCtfProb(term)``.

     Returns:
         0 when either model has ``len == 0``, otherwise the value of
         ``VectorC.cosine`` for the two weighted vectors.
     """
     # Check for empty models before building any vector; logical `or`
     # replaces the bitwise `|`.
     if LmA.len == 0 or LmB.len == 0:
         return 0

     vA = VectorC(LmA.hTermTF)
     vB = VectorC(LmB.hTermTF)

     # float() keeps the normalisation a true division on Python 2 even
     # when both the counts and the length are integers.
     vA /= float(LmA.len)
     vB /= float(LmB.len)

     for Vec in (vA, vB):
         for item in Vec.hDim:
             # NOTE(review): assumes GetCtfProb never returns 0 (e.g. it is
             # smoothed); a zero would raise ZeroDivisionError -- confirm.
             CTF = CtfCenter.GetCtfProb(item)
             Vec.hDim[item] *= math.log(1.0/CTF)

     score = VectorC.cosine(vA, vB)

     # Parenthesised single argument: same output on Python 2 and 3.
     print("cosine [%f] of:\n%s\n%s" %(score, json.dumps(vA.hDim),json.dumps(vB.hDim)))
     return score
Пример #13
0
 def __init__(self,InData=None,word = ""):
     """Initialise the vector and remember the word it belongs to.

     Args:
         InData: initial data forwarded to ``VectorC.__init__``; when it
             is a string it is additionally handed to ``self.SetLine``.
             Defaults to a new empty dict.
         word: the word this vector represents.
     """
     # A `{}` default would be one dict shared by every call that omits
     # InData; create a fresh one per instance instead.
     if InData is None:
         InData = {}
     VectorC.__init__(self,InData)
     self.word = word
     if isinstance(InData, str):
         self.SetLine(InData)