Example #1
0
 def Init(self):
     """Reset state: base Boe-LM fields plus doc-text cache and resource centers."""
     BoeLmC.Init(self)
     self.DocTextDir = ""  # directory of files holding per-doc raw text
     self.ObjCenter = FbObjCacheCenterC()  # fetches/caches Freebase objects
     self.CtfCenter = TermCtfC()  # collection term-frequency statistics
     self.lInferenceWeight = [1,0,0]  # presumably [tf, idf, text-sim] mix — confirm against LinearWeightTfIdfTextSim
     self.hDocText = {}  # DocNo -> raw document text
    def Init(self):
        """Reset extractor state: text-similarity feature over the description field."""
        ObjObjEdgeFeatureExtractorC.Init(self)
        # Tag this extractor's feature name, then configure which object
        # field is compared and with which similarity metric.
        self.FeatureName += 'TextSim'
        self.lObjField = ['desp']
        self.lFieldSimMetric = ['cosine']
        # Term ctf resource; the input path is filled in by SetConf.
        self.TermCtfIn = ""
        self.CtfCenter = TermCtfC()
def Process(PairCorrCntDictInName, CtfInName, OutName, SimMetric='tfidf'):
    """Compute pairwise object correlation scores and pickle them to OutName.

    Args:
        PairCorrCntDictInName: pickled dict of "ObjA ObjB" -> co-occurrence count.
        CtfInName: ctf statistics file loaded into a TermCtfC.
        OutName: output path for the pickled {"ObjA\\tObjB": score} dict.
        SimMetric: similarity metric name passed through to CalcSimilarity.

    Both directions (A->B and B->A) are scored, since the metric may be
    asymmetric.
    """
    # Fix: the original leaked the input/output file handles; use context
    # managers so they are closed deterministically.
    with open(PairCorrCntDictInName) as PairIn:
        hPairCnt = pickle.load(PairIn)
    logging.info('pair cnt loaded')

    ObjCtfCenter = TermCtfC()
    ObjCtfCenter.Load(CtfInName)

    hPairCorr = {}

    logging.info('start to calc obj corpus ana similarity')
    cnt = 0
    for key, tf in hPairCnt.items():
        ObjA, ObjB = key.split()
        hPairCorr[ObjA + '\t' + ObjB] = CalcSimilarity(ObjA, ObjB, tf,
                                                       ObjCtfCenter, SimMetric)
        hPairCorr[ObjB + '\t' + ObjA] = CalcSimilarity(ObjB, ObjA, tf,
                                                       ObjCtfCenter, SimMetric)
        cnt += 1
        if 0 == (cnt % 1000):
            logging.info('processed [%d] pair', cnt)

    # NOTE(review): text mode 'w' is kept for backward compatibility with the
    # existing pickle protocol-0 dumps; binary mode is the documented default.
    with open(OutName, 'w') as Out:
        pickle.dump(hPairCorr, Out)
    logging.info('corr score dumped to [%s]', OutName)
    return
Example #4
0
    def CalcDocObjDistribution(cls, doc, lDocObj):
        """Distribution of a document's similarity mass over its objects.

        Builds an LM for the document and one per object description, scores
        each object with cosine similarity, and L1-normalizes the scores.
        If all scores sum to zero, logs a warning and returns the raw
        (all-zero) scores unchanged.
        """
        doc_lm = LmBaseC(doc)
        obj_lms = [LmBaseC(obj.GetDesp()) for obj in lDocObj]

        scores = [LmBaseC.Similarity(obj_lm, doc_lm, TermCtfC(), 'cosine')
                  for obj_lm in obj_lms]

        total = float(sum(scores))
        if total != 0:
            return [score / total for score in scores]

        logging.warn('sum of doc obj scores is 0. raw scores:\n%s',
                     json.dumps(scores))
        return scores
Example #5
0
    def CalcObjDistributionOnQuery(cls, lQObj, lDocObj):
        """Distribution of doc objects' total similarity to the query objects.

        Each doc object's score is the sum of its cosine similarities (over
        description LMs) to every query object; the sums are L1-normalized.
        If every sum is zero, a warning is logged and the raw zeros returned.
        """
        q_lms = [LmBaseC(obj.GetDesp()) for obj in lQObj]
        d_lms = [LmBaseC(obj.GetDesp()) for obj in lDocObj]

        sim_rows = [[LmBaseC.Similarity(q_lm, d_lm, TermCtfC(), 'cosine')
                     for q_lm in q_lms]
                    for d_lm in d_lms]

        totals = [sum(row) for row in sim_rows]

        Z = float(sum(totals))
        if Z != 0:
            return [score / Z for score in totals]

        logging.warn('doc obj has no similarity with q obj')
        return totals
 def Init(self):
     # Reset configuration holders; populated later by SetConf.
     self.CtfCenter = TermCtfC()  # term collection-frequency statistics
     self.Word2VecFile = ""  # path to a word2vec dump file
     self.CateDenseCenter = CateAttCntDensityCenterC()# placeholder until SetConf loads it
     self.FbObjCacheCenter = FbObjCacheCenterC()  # Freebase object fetch/cache
class ObjVecMakerC(cxBaseC):
    """Builds vector representations for Freebase objects.

    Three vector flavors:
      * MakeLmVec         - tf * log(1/ctf) weights over description terms
      * MakeWord2Vec      - vectors looked up from a word2vec dump file
      * MakeCateAttCntVec - category attribute-count density features
    ProcessQObjFile drives all three for a qid/query/objid input file.
    Python 2 code (print statements, print >> redirection).
    """

    def Init(self):
        # Configuration holders; all populated by SetConf().
        self.CtfCenter = TermCtfC()  # term collection-frequency statistics
        self.Word2VecFile = ""  # path to a word2vec dump file
        self.CateDenseCenter = CateAttCntDensityCenterC()# placeholder until SetConf loads it
        self.FbObjCacheCenter = FbObjCacheCenterC()  # Freebase object fetch/cache

    def SetConf(self,ConfIn):
        """Load all resources from config keys: termctf, word2vec, cateattdense."""
        conf = cxConf(ConfIn)
        self.CtfCenter.Load(conf.GetConf('termctf'))
        self.Word2VecFile = conf.GetConf('word2vec')
        self.CateDenseCenter.load(conf.GetConf('cateattdense'))
        self.FbObjCacheCenter.SetConf(ConfIn)
        print "inited"

    @staticmethod
    def ShowConf():
        # Echo the config keys this class consumes.
        print "termctf\nword2vec\ncateattdense"
        FbObjCacheCenterC.ShowConf()

    def MakeLmVec(self,lFbObj):
        """Build one tf-idf-style VectorC per object from its description LM."""
        lVector = []
        print "start make lm vec"
        for FbObj in lFbObj:
            desp = FbObj.GetDesp()
            Lm = LmBaseC()
            Lm.SetFromRawText(desp)
            Vector = VectorC()
            for term in Lm.hTermTF:
                # tf-probability * log(1 / collection-tf-probability)
                score = Lm.GetTFProb(term) * math.log(1.0/self.CtfCenter.GetCtfProb(term))
                Vector.hDim[term] = score
            Vector.Key = FbObj.GetId()
            lVector.append(Vector)
        return lVector

    def MakeWord2Vec(self,lFbObjId):
        """Look up one vector per object id by scanning the word2vec dump once.

        Objects absent from the dump keep an empty VectorC (only Key set).
        """
        print "start make word2vec [%s]" %(self.Word2VecFile)
        lObjId = lFbObjId
        hObjP = dict(zip(lObjId,range(len(lObjId))))  # objid -> output position
        lVector = []
        for i in range(len(lObjId)):
            Vector = VectorC()
            Vector.Key = lObjId[i]
            lVector.append(Vector)

        reader = Word2VecReaderC()
        reader.open(self.Word2VecFile)
        print "start tarverse word2vec file [%s]" %(self.Word2VecFile)
        for word2vec in reader:
            # Only keep entries for the requested object ids.
            if not word2vec.word in hObjP:
                continue
            p = hObjP[word2vec.word]
            lVector[p].hDim = word2vec.hDim
            print "get [%s]" %(lVector[p].Key)
        reader.close()
        return lVector

    def IsStopCate(self,cate):
        """True if cate starts with a stop-listed prefix (currently only '/common')."""
        lStop = ['/common']
        for item in lStop:
            if item == cate[:len(item)]:
                return True
        return False

    def MakeCateAttCntVec(self,lFbObj):
        """Build one normalized VectorC per object over category att-count densities."""
        #require the cate att cnt in APIBase
        #and the cate att distribution (empirical) center
        lVector = []
        print "start make cate att cnt vec"
        for FbObj in lFbObj:
            Vector = VectorC()
            Vector.Key = FbObj.GetId()
            hCate = FbObj.FormCategoryAttCnt()
            print "cate for [%s]: \n%s" %(Vector.Key,json.dumps(hCate))
            for cate in hCate:
                if self.IsStopCate(cate):
                    continue
                cnt = hCate[cate]
                # empirical probability of seeing cnt attributes for this category
                cdf = self.CateDenseCenter.GetProb(cate, cnt)
                print "cate [%s] prob[%f]" %(cate,cdf)
                Vector.hDim[cate] = cdf
            Vector.Normalize()
            lVector.append(Vector)
        return lVector

    def ProcessQObjFile(self,InName,OutName):
        """Read qid/query/objid lines; write desp, cate and word2vec vector files."""
        #in: qid    query    objid
        #out: OutName_desp,OutName_cate,OutName_word2vec

        # NOTE(review): the three output handles are closed below, but
        # open(InName) is never closed explicitly (relies on GC).
        OutDesp = open(OutName + "_desp",'w')
        OutCate = open(OutName + "_cate",'w')
        OutWord2Vec = open(OutName + "_word2vec","w")

        lQidQuery = []
        lFbObjId = []
        lFbObjName = []
        #read objid
        for line in open(InName):
            vCol = line.strip().split('\t')
            lQidQuery.append([vCol[0],vCol[1]])
            FbObj = self.FbObjCacheCenter.FetchObj(vCol[2])
            lFbObjId.append(FbObj.GetId())
            lFbObjName.append(FbObj.GetName())
            lDespVec = self.MakeLmVec([FbObj])
            lCateVec = self.MakeCateAttCntVec([FbObj])
            try:
                print >> OutDesp,vCol[0] + "\t" + vCol[1] + '\t' + FbObj.GetId() + '\t' + FbObj.GetName() + '\t' + lDespVec[0].dumps()
                print >> OutCate,vCol[0] + "\t" + vCol[1] + '\t'+ FbObj.GetId() + '\t' + FbObj.GetName() + '\t' + lCateVec[0].dumps()
            except UnicodeEncodeError:
                # Skip rows whose text cannot be encoded; note the ids/names
                # were already appended to the lists above.
                print "unicode encode error, discard"
            FbObj.clear()

        print "fetched, lm and cate vecs made, start make vecs from word2vec"
        #extract and dump

        lWord2Vec = self.MakeWord2Vec(lFbObjId)
        print "dumping"
        for i in range(len(lQidQuery)):
            try:
                print >> OutWord2Vec,lQidQuery[i][0] + "\t" + lQidQuery[i][1] + '\t'+ lFbObjId[i] + '\t' + lFbObjName[i] + '\t' + lWord2Vec[i].dumps()
            except UnicodeEncodeError:
                print "unicode encode error, discard"

        OutDesp.close()
        OutCate.close()
        OutWord2Vec.close()
        print 'done'
        return True
 def SetConf(self, ConfIn):
     """Load config: base extractor settings, then the term-ctf resource."""
     ObjObjEdgeFeatureExtractorC.SetConf(self, ConfIn)
     self.TermCtfIn = self.conf.GetConf('termctf')
     self.CtfCenter = TermCtfC(self.TermCtfIn)
# Script fragment: form and dump top neighbors for a set of target objects.
# NOTE(review): conf, TargetObjIn, PairDictIn, PairRawIn, IdfDictIn, OutName
# and NumOfNeighbor are defined earlier in the full script (not visible here).
CorrType = conf.GetConf('correlationmeasure')


# Target object ids, one per line -> {objid: line position}
lLine = open(TargetObjIn).read().splitlines()
hTargetObj = dict(zip(lLine,range(len(lLine))))
print "[%d] target obj load" %(len(hTargetObj))

# Pair co-occurrence counts: prefer the pickled dict, else parse the raw file.
hPair = {}
if PairDictIn != "":
    hPair = pickle.load(open(PairDictIn))
    print "[%d] pair cnt load" %(len(hPair))
else:
    if PairRawIn != "":
        hPair = FormPairDictFromRaw(PairRawIn)

CtfCenter = TermCtfC()
CtfCenter.Load(IdfDictIn)
print "df load"

print "forming neighbors..."
hTargetObjNeighbor = FormTargetObjNeighbors(hTargetObj, hPair, CtfCenter,CorrType)
print "dumpping results..."
DumpTargetObjTopNeighbor(hTargetObjNeighbor, OutName, NumOfNeighbor)
print "finished"





        
        
Example #10
0
class BoeLmWeighterC(BoeLmC):
    
    def __init__(self,ConfIn = ""):
        self.Init()
        if "" != ConfIn:
            self.SetConf(ConfIn)
    
    def Init(self):
        BoeLmC.Init(self)
        self.DocTextDir = ""
        self.ObjCenter = FbObjCacheCenterC()
        self.CtfCenter = TermCtfC()
        self.lInferenceWeight = [1,0,0]
        self.hDocText = {}
        
        
        
    def SetConf(self,ConfIn):
        conf = cxConfC(ConfIn)
        
        self.DocTextDir = conf.GetConf('doctextdir')
        self.LoadDocText()
        
        self.ObjCenter.SetConf(ConfIn)
        
        CtfInName = conf.GetConf('objctf')
        self.CtfCenter.Load(CtfInName)
        
    @classmethod
    def ShowConf():
        print 'doctextdir\nobjctf'
        FbObjCacheCenterC.ShowConf()
        
    def LoadDocText(self):
        for fname in WalkDir(self.DocTextDir):
            for line in open(fname):
                DocNo,text = line.strip().split('\t')
                self.hDocText[DocNo] = text    
        logging.info('doc text loaded')
        
    def GetAllIdf(self,DocKg):
        lItem = DocKg.hNodeId.items()
        lItem.sort(key=lambda item:item[1])
        lObjId = [item[0] for item in lItem]
        
        
        
        lRes = []
        for ObjId in lObjId:
            idf  = self.CtfCenter.GetLogIdf(ObjId)
            lRes.append(idf)
        return lRes
    
    def GetAllTf(self,DocKg):
        return list(DocKg.vNodeWeight)
    
    def GetAllTextCosine(self,DocKg):
        
        DocText = ""
        if DocKg.DocNo in self.hDocText:
            DocText = self.hDocText[DocKg.DocNo]
        
        lCos = []
        if "" == DocText:
            return [0] * len(DocKg)
        DocLm = LmBaseC(DocText)
        lItem = DocKg.hNodeId.items()
        lItem.sort(key=lambda item:item[1])
        lObjId = [item[0] for item in lItem]
        
        for ObjId in lObjId:
            desp = self.ObjCenter.FetchObjDesp(ObjId)
            lm = LmBaseC(desp)
            lCos.append(LmBaseC.Cosine(lm, DocLm))
            
            
        
        return lCos
    
    def GetTextCosine(self,ObjId,DocKg):
        DocText = ""
        if DocKg.DocNo in self.hDocText:
            DocText = self.hDocText[DocKg.DocNo]
        DocLm = LmBaseC(DocText)
        desp = self.ObjCenter.FetchObjDesp(ObjId)
        lm = LmBaseC(desp)
        score = LmBaseC.Cosine(lm, DocLm)
        if 0 == score:
            return self.MinLogProb
        return math.log(score)
        
    
    def LinearWeightTfIdfTextSim(self,ObjId,DocKg,TfScore = 1,IdfScore = 0, TextSimScore = 0):
        
        if not ObjId in DocKg:
            return self.MinLogProb
        
        
        lTf = np.zeros(len(DocKg))
        lIdf = np.zeros(len(DocKg))
        if TfScore != 0:
            lTf = np.array(self.GetAllTf(DocKg))
        if IdfScore != 0:
            lIdf = np.array(self.GetAllIdf(DocKg))
#         lCos = np.array(self.GetAllTextCosine(DocKg))
        TextSim = 0
        if TextSimScore != 0:
            TextSim = self.GetTextCosine(ObjId,DocKg)
        W = np.array([TfScore,IdfScore,TextSimScore])
        
        W = W / float(sum(W))
        
        lScore = lTf * W[0] + lIdf * W[1]
        
        res = self.MinLogProb * (W[0] + W[1])
        if ObjId in DocKg:
            p = DocKg.hNodeId[ObjId]
            res = lScore[p]
        res = res + TextSim * TextSimScore
        return res
        
    def inference(self, ObjId, DocKg):
        return self.LinearWeightTfIdfTextSim(ObjId, DocKg, self.lInferenceWeight[0], self.lInferenceWeight[1], self.lInferenceWeight[2])
What I do:
I collect the ctf/idf statistics of MeSH terms from the doc->MeSH annotation dict
What's my input:
DocMeSHDict (a pickled dict: DocNo -> list of (UI, term) annotations)
What's my output:
termctf.dump

'''

import site
site.addsitedir('/bos/usr0/cx/PyCode/cxPyLib')
from IndriRelate.CtfLoader import TermCtfC

import sys
import pickle

if 3 != len(sys.argv):
    print "DocMeSHDict + outname"
    sys.exit()
    
hDocMeSH = pickle.load(open(sys.argv[1]))

CtfCenter = TermCtfC()

for DocNo,lAna in hDocMeSH.items():
    for UI,term in lAna:
        CtfCenter.insert(UI)
        
CtfCenter.dump(sys.argv[2])
print "done"