Пример #1
0
    def Init(self):
        """Run the base-class Init, then give every attribute its default value."""
        cxBaseC.Init(self)

        # Both paths stay empty until they are configured.
        self.InDir, self.OutDir = "", ""

        # Relevance-judgement helper object.
        self.QRelCenter = AdhocQRelC()
        # Mapping: query name -> qid.
        self.hQueryQid = {}
Пример #2
0
    def Init(self):
        """Run the base-class Init, then give every attribute its default value."""
        cxBaseC.Init(self)

        # Relevance-judgement helper object.
        self.RelCenter = AdhocQRelC()

        # Both paths stay empty until they are configured.
        self.InDir, self.OutDir = "", ""

        # Feature name -> id tables, one for node features and one for edge
        # features; both start out empty.
        self.hNodeFeatureId = {}
        self.hEdgeFeatureId = {}
 def Init(self):
     """Run the base-class Init, then give every attribute its default value."""
     cxBaseC.Init(self)

     # Nothing has been loaded yet.
     self.Prepared = False
     # Names of the enabled feature groups.
     self.lFeatureGroup = []

     # word2vec input name and the (not yet loaded) model object.
     self.Word2VecInName, self.Word2VecModel = "", None
     self.QRelIn = ""

     # Helper objects, created in the same order as before.
     self.Searcher = IndriSearchCenterC()
     self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
     self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
     self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
     self.QRelCenter = AdhocQRelC()
class LeToRFeatureExtractCenterC(cxBaseC):
    
    def Init(self):
        """Run the base-class Init, then give every attribute its default value."""
        cxBaseC.Init(self)

        # Set to True by Prepare() once the model and extractors are ready.
        self.Prepared = False
        # Names of the enabled feature groups.
        self.lFeatureGroup = []

        # word2vec input name and the (not yet loaded) model object.
        self.Word2VecInName, self.Word2VecModel = "", None
        self.QRelIn = ""

        # Helper objects, created in the same order as before.
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()
        
    
    def SetConf(self, ConfIn):
        """Read the settings from ``ConfIn`` and configure the helper objects.

        Keys read from ``self.conf``: ``word2vecin``, ``featuregroup`` and
        ``qrel``.  The qrel file is loaded straight away.  A ``featuregroup``
        value that is not a list is wrapped into a one-element list.  The
        searcher is always configured; each extractor is configured only when
        its group name is enabled.

        Returns True.
        """
        cxBaseC.SetConf(self, ConfIn)

        # presumably self.conf is filled in by cxBaseC.SetConf -- confirm there
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.lFeatureGroup = self.conf.GetConf('featuregroup')
        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)

        # A single group name arrives as a bare value; normalise it to a list.
        if type(self.lFeatureGroup) is not list:
            self.lFeatureGroup = [self.lFeatureGroup]

        self.Searcher.SetConf(ConfIn)

        # group name -> extractor that serves it, in the original order
        lGroupExtractor = (
            ('givenfeature', self.GivenFeatureExtractor),
            ('termpairemb', self.EmbTermPairFeatureExtractor),
            ('emblm', self.EmbLmFeatureExtractor),
        )
        for GroupName, Extractor in lGroupExtractor:
            if GroupName in self.lFeatureGroup:
                Extractor.SetConf(ConfIn)

        return True
    
    @staticmethod
    def ShowConf():
        """Print the configuration keys used by this class and its helpers.

        Prints the base-class keys, then this class's own keys
        (``word2vecin``, ``featuregroup``, ``qrel``), then the keys of each
        extractor and of the searcher.
        """
        cxBaseC.ShowConf()
        # 'emblm' is one of the accepted 'featuregroup' values (SetConf tests
        # it with "in self.lFeatureGroup"); it is not a key of its own, so it
        # is listed with the other group names instead of on a separate line.
        print('word2vecin\nfeaturegroup givenfeature|termpairemb|emblm\nqrel')
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        
    def Prepare(self):
        """Load the word2vec model and prepare the enabled extractors.

        Does nothing when ``self.Prepared`` is already True; otherwise sets it
        to True once everything has been prepared.
        """
        if self.Prepared:
            return

        logging.info('start load word2vec input')
        LoadModel = gensim.models.Word2Vec.load_word2vec_format
        self.Word2VecModel = LoadModel(self.Word2VecInName)
        logging.info('word2vec loaded')

        # Only extractors whose group is enabled are prepared.
        lGroupExtractor = (
            ('givenfeature', self.GivenFeatureExtractor),
            ('termpairemb', self.EmbTermPairFeatureExtractor),
            ('emblm', self.EmbLmFeatureExtractor),
        )
        for GroupName, Extractor in lGroupExtractor:
            if GroupName in self.lFeatureGroup:
                Extractor.Prepare()

        self.Prepared = True
    
    def Process(self, qid,query,doc):
        """Extract the features of every enabled group for one (query, doc) pair.

        Calls Prepare() first, then merges the dictionaries returned by the
        enabled extractors into one dictionary (later groups overwrite equal
        keys of earlier ones) and returns it.
        """
        self.Prepare()

        logging.debug('extracting for [%s][%s]',qid,doc.DocNo)

        # (group name, deferred extractor call, debug message), in run order.
        # The calls are deferred so the model attribute is read at call time.
        lStep = (
            ('givenfeature',
             lambda: self.GivenFeatureExtractor.Extract(qid, query, doc),
             'given feature extracted'),
            ('termpairemb',
             lambda: self.EmbTermPairFeatureExtractor.Extract(
                 qid, query, doc, self.Word2VecModel),
             'termpairemb feature extracted'),
            ('emblm',
             lambda: self.EmbLmFeatureExtractor.Extract(
                 qid, query, doc, self.Word2VecModel),
             'emblm feature extracted'),
        )

        hFeature = {}
        for GroupName, RunExtract, DoneMsg in lStep:
            if GroupName in self.lFeatureGroup:
                hFeature.update(RunExtract())
                logging.debug(DoneMsg)

        return hFeature
    
    
    def PipeLineRun(self,QInName,OutName):
        """Extract features for every query in ``QInName`` and write them out.

        ``QInName`` is read as text with one ``qid<TAB>query`` pair per line
        (each line must split into exactly two columns).  For every query the
        searcher is run, and for each returned document one line produced by
        ``LeToRDataBaseC.dumps()`` is written to ``OutName``.  The feature
        name -> id table built along the way is written to
        ``OutName + '_FeatureName'`` as ``id<TAB>name`` lines.

        All files are handled with ``with`` so they are closed even when
        extraction raises part-way through.
        """
        hFeatureName = {}
        self.Prepare()

        with open(QInName) as QIn:
            lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

        logging.info('start extracting for file [%s]',QInName)
        with open(OutName,'w') as out:
            for qid,query in lQidQuery:
                for doc in self.Searcher.RunQuery(query, qid):
                    LTRData = LeToRDataBaseC()
                    LTRData.qid = qid
                    LTRData.DocNo = doc.DocNo
                    LTRData.hFeature = self.Process(qid, query, doc)
                    LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)

                    # the name table is threaded through every record so ids
                    # stay consistent across the whole run
                    hFeatureName = LTRData.HashFeatureName(hFeatureName)
                    out.write('%s\n' % LTRData.dumps())

                logging.info('qid [%s] extracted',qid)

        with open(OutName + '_FeatureName','w') as NameOut:
            for name,Id in hFeatureName.items():
                NameOut.write('%d\t%s\n' %(Id,name))

        logging.info('finished')
        return
Пример #5
0
 def ShowConf(cls):
     """Print the base-class keys, the class name, this class's keys and the qrel keys."""
     cxBaseC.ShowConf()
     # class name first, then the two keys, one per line
     print('%s\nindir\noutdir' % cls.__name__)
     AdhocQRelC.ShowConf()
Пример #6
0
class GraphFeaturePostProcessorC(cxBaseC):
    def Init(self):
        """Run the base-class Init, then give every attribute its default value."""
        cxBaseC.Init(self)

        # Relevance-judgement helper object.
        self.RelCenter = AdhocQRelC()

        # Both directories stay empty until SetConf fills them in.
        self.InDir, self.OutDir = "", ""

        # Feature name -> column id, for node features and for edge features.
        self.hNodeFeatureId = {}
        self.hEdgeFeatureId = {}

    def SetConf(self, ConfIn):
        """Read ``indir`` and ``outdir`` from the configuration and configure the qrel helper.

        Both directory values are stored with a trailing ``/`` appended.
        """
        cxBaseC.SetConf(self, ConfIn)

        RawInDir = self.conf.GetConf('indir')
        self.InDir = RawInDir + '/'

        RawOutDir = self.conf.GetConf('outdir')
        self.OutDir = RawOutDir + '/'

        self.RelCenter.SetConf(ConfIn)

    @classmethod
    def ShowConf(cls):
        """Print the base-class keys, the class name, this class's keys and the qrel keys."""
        cxBaseC.ShowConf()
        # class name first, then the two keys, one per line
        print('%s\nindir\noutdir' % cls.__name__)
        AdhocQRelC.ShowConf()

    def HashFeatureName(self, hFeature={}):
        '''
        go through the full input dir, hash node features and edge features
        '''
        if {} != hFeature:
            self.MakeFeatureHashFromNames(hFeature.keys())
        else:
            self.MakeFeatureFromRawData()

        return True

    def MakeFeatureFromRawData(self):
        """Build the node and edge feature id tables by scanning the input directory.

        Every file returned by ``WalkDir(self.InDir)`` is read; each line is
        classified as a node-feature line or an edge-feature line, and the
        feature names found in the two groups are hashed separately.

        Each file is closed as soon as it has been read, and every line is
        classified only once.

        :return: True
        """
        sNodeFeatureName = set()
        sEdgeFeatureName = set()

        for FName in WalkDir(self.InDir):
            with open(FName) as In:
                lLines = In.read().splitlines()

            lNodeLines = []
            lEdgeLines = []
            for line in lLines:
                if self.IsNodeFeatureLine(line):
                    lNodeLines.append(line)
                else:
                    lEdgeLines.append(line)

            sNodeFeatureName.update(self.GetFeatureName(lNodeLines))
            sEdgeFeatureName.update(self.GetFeatureName(lEdgeLines))

        self.MakeNodeFeatureHash(sNodeFeatureName)
        self.MakeEdgeFeatureHash(sEdgeFeatureName)

        logging.info('feature hash id assigned from raw data')
        return True

    def MakeFeatureHashFromNames(self, lName):

        lEdgeFeatureName = [
            name for name in lName
            if name.startswith('ObjObj') | name.startswith('QObj')
        ]
        sEdgeFeatureName = set(lEdgeFeatureName)
        sNodeFeatureName = set(lName) - sEdgeFeatureName

        self.MakeNodeFeatureHash(sNodeFeatureName)
        self.MakeEdgeFeatureHash(sEdgeFeatureName)

        logging.info('Node Feature hash: %s', json.dumps(self.hNodeFeatureId))
        logging.info('Edge Feature hash: %s', json.dumps(self.hEdgeFeatureId))

        return True

    def FindGlobalFeatureMaxMin(self):
        """Combine the per-query feature maxima and minima over the whole input directory.

        Every directory yielded by ``os.walk(self.InDir)`` except the top one
        is treated as a query directory.  Returns ``(hMax, hMin)``.
        """
        hMax, hMin = {}, {}

        for QDir, _, _ in os.walk(self.InDir):
            # the walk starts with the top directory itself, which holds no data
            if QDir != self.InDir:
                hQMax, hQMin = self.FindMaxMinFeatureValuesForQ(QDir)
                hMax = FeatureProcessorC.Max(hMax, hQMax)
                hMin = FeatureProcessorC.Min(hMin, hQMin)

        logging.info('Global feature max-min found')
        return hMax, hMin

    def MakeNodeFeatureHash(self, sNodeFeatureName):
        '''
        put LeToR features first
        '''
        lName = list(sNodeFeatureName)

        lLtrName = [name for name in lName if name.startswith('LeToR')]
        lObjName = [name for name in lName if not name.startswith('LeToR')]

        lLtrName.sort()
        lObjName.sort()

        lName = lLtrName + lObjName

        self.hNodeFeatureId = dict(zip(lName, range(len(lName))))

        return True

    def MakeEdgeFeatureHash(self, sEdgeFeatureName):
        '''
        put QObj features first
        '''
        lName = list(sEdgeFeatureName)

        lQObjName = [name for name in lName if name.startswith('QObj')]
        lObjObjName = [name for name in lName if not name.startswith('QObj')]

        lQObjName.sort()
        lObjObjName.sort()

        lName = lQObjName + lObjObjName

        self.hEdgeFeatureId = dict(zip(lName, range(len(lName))))

        return True

    def GetFeatureName(self, lLines):
        """Return the set of feature names that occur in ``lLines``.

        The last tab-separated column of every line is parsed as JSON and its
        keys are collected.  All lines are parsed before any name is
        gathered, so a malformed line raises before anything is returned.
        """
        lhFeature = [json.loads(line.split('\t')[-1]) for line in lLines]

        sName = set()
        for hFeature in lhFeature:
            sName.update(hFeature.keys())

        return sName

    def FindMaxMinFeatureValuesForQ(self, QDir):
        """Find the per-feature maximum and minimum over all files of one query.

        Every file returned by ``WalkDir(QDir)`` is read line by line; the
        last tab-separated column of each line is parsed as a JSON feature
        dictionary and folded into the running maximum / minimum via
        ``FeatureProcessorC.Max`` / ``FeatureProcessorC.Min``.

        Each document file is closed as soon as it has been consumed.

        :return: ``(hFeatureMax, hFeatureMin)``
        """
        hFeatureMax = {}
        hFeatureMin = {}

        for DocName in WalkDir(QDir):
            logging.info('finding max min of [%s]', DocName)
            with open(DocName) as DocIn:
                for line in DocIn:
                    vCol = line.strip().split('\t')
                    hFeature = json.loads(vCol[-1])
                    hFeatureMax = FeatureProcessorC.Max(hFeature, hFeatureMax)
                    hFeatureMin = FeatureProcessorC.Min(hFeature, hFeatureMin)

        QName = ntpath.basename(QDir)
        logging.info('q [%s] max-min feature score get', QName)
        logging.info('q [%s] max %s', QName, json.dumps(hFeatureMax))
        logging.info('q [%s] min %s', QName, json.dumps(hFeatureMin))
        return hFeatureMax, hFeatureMin

    def ProcessOneDoc(self, Qid, DocInName, hFeatureMax, hFeatureMin):
        """Convert one raw document file into matrices and pickle the result.

        Steps:
          * read the file and split its lines into node-feature lines and
            edge-feature lines
          * give each node of the document an id
          * build the normalized node feature matrix and edge feature tensor
          * fetch the relevance label of (Qid, document)
          * pickle ``[NodeMtx, EdgeTensor, rel, hNodeId]`` to
            ``<OutDir>/<Qid>/<DocNo>``, creating the query directory if needed

        ``DocNo`` is the base name of ``DocInName``.  Both files are closed
        deterministically; the pickle is written in binary mode as the pickle
        module expects.

        :return: True
        """
        with open(DocInName) as DocIn:
            lLines = DocIn.read().splitlines()

        lNodeLines = []
        lEdgeLines = []
        for line in lLines:
            if self.IsNodeFeatureLine(line):
                lNodeLines.append(line)
            else:
                lEdgeLines.append(line)

        hNodeId = self.HashPerDocNode(lNodeLines)

        NodeMtx = self.FormNodeMtx(lNodeLines, hNodeId, hFeatureMax,
                                   hFeatureMin)
        EdgeTensor = self.FormEdgeTensor(lEdgeLines, hNodeId, hFeatureMax,
                                         hFeatureMin)

        DocNo = ntpath.basename(DocInName)
        rel = self.RelCenter.GetScore(Qid, DocNo)

        # path strings are built exactly as before so log output is unchanged
        QOutDir = self.OutDir + '/' + Qid
        OutName = QOutDir + '/' + DocNo
        if not os.path.exists(QOutDir):
            os.makedirs(QOutDir)

        with open(OutName, 'wb') as out:
            pickle.dump([NodeMtx, EdgeTensor, rel, hNodeId], out)

        logging.info('[%s] processed and dumped', OutName)

        return True

    def HashPerDocNode(self, lLines):

        lNode = []
        QNode = ""
        for line in lLines:
            vCol = line.split('\t')
            for NodeName in vCol[:2]:
                if self.IsObjNode(NodeName):
                    lNode.append(NodeName)
                if self.IsQNode(NodeName):
                    QNode = NodeName

        lNode = list(set(lNode))
        lNode.sort()
        lTotalNode = [QNode] + lNode

        hNodeId = dict(zip(lTotalNode, range(len(lTotalNode))))

        return hNodeId

    def FormNodeMtx(self, lNodeLines, hNodeId, hFeatureMax, hFeatureMin):
        """Build the node feature matrix of one document.

        The matrix has one row per entry of ``hNodeId`` and one column per
        entry of ``self.hNodeFeatureId`` and starts out as zeros.  For every
        line, the first column names the node, the last column holds the JSON
        feature dictionary; the dictionary is max-min normalized, turned into
        a vector and written into the node's row.
        """
        Shape = [len(hNodeId), len(self.hNodeFeatureId)]
        NodeMtx = numpy.zeros(Shape)

        for vCol in (line.split('\t') for line in lNodeLines):
            # resolve the row first, then decode the features
            Row = hNodeId[vCol[0]]
            hRaw = json.loads(vCol[-1])

            hNormed = FeatureProcessorC.MaxMinNormalization(
                hRaw, hFeatureMax, hFeatureMin)

            NodeMtx[Row] = FeatureProcessorC.VectorlizeFeature(
                hNormed, self.hNodeFeatureId)

        logging.info('node feature matrix converted')

        return NodeMtx

    def FormEdgeTensor(self, lEdgeLines, hNodeId, hFeatureMax, hFeatureMin):
        """Build the edge feature tensor of one document.

        The tensor is indexed ``[node a, node b, edge feature]`` and starts
        out as zeros.  For every line, columns 0 and 1 name the two nodes and
        column 2 holds the JSON feature dictionary; the dictionary is max-min
        normalized, turned into a vector and written into the (a, b) cell.
        """
        NodeN = len(hNodeId)
        EdgeTensor = numpy.zeros([NodeN, NodeN, len(self.hEdgeFeatureId)])

        for vCol in (line.split('\t') for line in lEdgeLines):
            # resolve both endpoints first, then decode the features
            NodeA = hNodeId[vCol[0]]
            NodeB = hNodeId[vCol[1]]
            hRaw = json.loads(vCol[2])

            hNormed = FeatureProcessorC.MaxMinNormalization(
                hRaw, hFeatureMax, hFeatureMin)

            EdgeTensor[NodeA, NodeB] = FeatureProcessorC.VectorlizeFeature(
                hNormed, self.hEdgeFeatureId)

        logging.info('edge feature tensor converted')

        return EdgeTensor

    def Process(self):
        """Run the whole post-processing pipeline over the input directory.

        Finds the global feature max / min, hashes the feature names seen in
        the global maximum, converts every document of every query directory
        using those global bounds, and finally dumps the feature id tables.
        Returns True.
        """
        hGlobalMax, hGlobalMin = self.FindGlobalFeatureMaxMin()
        self.HashFeatureName(hGlobalMax)

        for QDir, _, lDocName in os.walk(self.InDir):
            # the top directory itself is not a query directory
            if QDir != self.InDir:
                logging.info('start working on query dir [%s]', QDir)
                qid = ntpath.basename(QDir)

                for DocName in lDocName:
                    DocPath = QDir + '/' + DocName
                    self.ProcessOneDoc(qid, DocPath, hGlobalMax, hGlobalMin)

                logging.info('q [%s] processed', qid)

        self.DumpFeatureHash()

        logging.info('feature normalized and transformed')
        return True

    def DumpFeatureHash(self):
        out = open(self.OutDir + 'NodeFeatureId', 'w')
        lNodeF = self.hNodeFeatureId.items()
        lNodeF.sort(key=lambda item: int(item[1]))
        print >> out, '\n'.join(
            ['%s\t%s' % (item[0], item[1]) for item in lNodeF])
        out.close()

        out = open(self.OutDir + 'EdgeFeatureId', 'w')
        lEdgeF = self.hEdgeFeatureId.items()
        lEdgeF.sort(key=lambda item: int(item[1]))
        print >> out, '\n'.join(
            ['%s\t%s' % (item[0], item[1]) for item in lEdgeF])
        out.close()
        logging.info('feature id name dumped')
        return

    def IsQNode(self, NodeName):
        """Return True when ``NodeName`` starts with the prefix ``q_``."""
        QueryPrefix = 'q_'
        return NodeName.startswith(QueryPrefix)

    def IsObjNode(self, NodeName):
        """Return True when ``NodeName`` starts with the prefix ``/m/``."""
        ObjPrefix = '/m/'
        return NodeName.startswith(ObjPrefix)

    def IsNodeFeatureLine(self, line):
        vCol = line.split('\t')
        if self.IsQNode(vCol[1]) | self.IsObjNode(vCol[1]):
            return False
        return True
Пример #7
0
class GraphDataPreparationcC(cxBaseC):
    def Init(self):
        """Run the base-class Init, then give every attribute its default value."""
        cxBaseC.Init(self)

        # Both directories stay empty until SetConf fills them in.
        self.InDir, self.OutDir = "", ""

        # Relevance-judgement helper object.
        self.QRelCenter = AdhocQRelC()
        # Mapping: query name -> qid.
        self.hQueryQid = {}

    def SetConf(self, ConfIn):
        """Read ``indir``, ``outdir``, ``qrel`` and ``qin`` from the configuration.

        The qrel file is loaded into ``self.QRelCenter`` and the query file is
        loaded into the query name -> qid table.
        """
        cxBaseC.SetConf(self, ConfIn)

        GetConf = self.conf.GetConf
        self.InDir = GetConf('indir')
        self.OutDir = GetConf('outdir')

        self.QRelCenter.Load(GetConf('qrel'))
        self.LoadQueryQid(GetConf('qin'))

    @staticmethod
    def ShowConf():
        """Print the base-class keys followed by this class's keys, one per line.

        The keys are the ones SetConf reads: ``indir``, ``outdir``, ``qrel``
        and ``qin``.
        """
        cxBaseC.ShowConf()
        # 'qrel' and 'qin' used to be fused into 'qrelnqin' by a missing
        # backslash before the 'n'
        print('indir\noutdir\nqrel\nqin')

    def LoadQueryQid(self, QIn):
        """Load the query name -> qid table from the file ``QIn``.

        Each line is split on tabs; column 0 is the qid and column 1 the query
        text.  The key is whatever
        ``IndriSearchCenterC.GenerateQueryTargetName`` returns for the query
        text.  The result replaces ``self.hQueryQid``.

        The file is closed as soon as it has been read.
        """
        with open(QIn) as In:
            lQidQuery = [line.split('\t') for line in In.read().splitlines()]

        self.hQueryQid = dict(
            (IndriSearchCenterC.GenerateQueryTargetName(item[1]), item[0])
            for item in lQidQuery)

    def UpdateHashId(self, name, hDict):
        """Give ``name`` the next free id in ``hDict`` unless it already has one.

        The next free id is the current size of ``hDict``; the dictionary is
        modified in place and nothing is returned.
        """
        hDict.setdefault(name, len(hDict))

    def GeneratePerQHashMapping(self, InName):
        hNodeId = {}
        lEdgeFeatureName = []
        for line in open(InName):
            NodeA, NodeB, FeatureStr = line.strip().split('\t')
            self.UpdateHashId(NodeA, hNodeId)
            self.UpdateHashId(NodeB, hNodeId)

            hFeature = json.loads(FeatureStr)
            lEdgeFeatureName.extend(hFeature.keys())

        lEdgeFeatureName = list(set(lEdgeFeatureName))
        lEdgeFeatureName.sort()  #make sure feature id is uniq
        hEdgeFeatureId = dict(
            zip(lEdgeFeatureName, range(len(lEdgeFeatureName))))

        logging.info('[%s] id made [%d] node [%d] edge feature', InName,
                     len(hNodeId), len(hEdgeFeatureId))
        return hNodeId, hEdgeFeatureId

    def FormGraphTensorPerFile(self, InName, hNodeId, hEdgeFeatureId):
        '''
        form tensor for data in InName
        '''

        NodeN = len(hNodeId)
        FeatureDim = len(hEdgeFeatureId)
        logging.info('initializing [%d^2,-%d] graph tensor', NodeN, FeatureDim)
        GraphTensor = np.zeros((NodeN, NodeN, FeatureDim))

        for line in open(InName):
            NodeA, NodeB, FeatureStr = line.strip().split('\t')
            hFeature = json.loads(FeatureStr)

            AId = hNodeId[NodeA]
            BId = hNodeId[NodeB]
            for key, score in hFeature.items():
                FId = hEdgeFeatureId[key]
                GraphTensor[AId, BId, FId] = score

        return GraphTensor

    def FetchQRelVec(self, hNodeId, qid):
        '''
        fetch the relevance score from self.QRelCenter
        if the node is a query or a object, then rel score is np.nan
        '''

        QRelVec = np.zeros(len(hNodeId))
        for name, p in hNodeId:
            if not name.startswith('clueweb'):
                QRelVec[p] = np.nan
                continue

            QRelVec[p] = self.QRelCenter.GetScore(qid, name)

        return QRelVec

    def ProcessOneQuery(self, InName):
        QName = ntpath.basename(InName)
        qid = self.hQueryQid[QName]
        OutPre = self.OutDir + '/' + qid

        hNodeId, hEdgeFeatureId = self.GeneratePerQHashMapping(InName)

        pickle.dump(hNodeId, open(OutPre + '_NodeId', 'w'))
        pickle.dump(hEdgeFeatureId, open(OutPre + '_EdgeFeatureId', 'w'))
        logging.info('[%s] hash id dumped', QName)

        GraphTensor = self.FormGraphTensorPerFile(InName, hNodeId,
                                                  hEdgeFeatureId)
        pickle.dump(GraphTensor, open(OutPre + '_Graph', 'w'))
        logging.info('[%s] graph tensor dumped', QName)

        QRelVec = self.FetchQRelVec(hNodeId, qid)
        pickle.dump(QRelVec, open(OutPre + '_Label', 'w'))
        logging.info('[%s] label vec dumped', QName)

    @staticmethod
    def LoadOneQuery(InPre):
        GraphTensor = pickle.load(open(InPre + '_Graph'))
        QRelVec = pickle.load(open(InPre + '_Label'))
        return GraphTensor, QRelVec

    @staticmethod
    def LoadData(InDir):
        """Load the graph tensors and label vectors of every query under ``InDir``.

        The distinct file prefixes are obtained by dropping the part after the
        last ``_`` of every name returned by ``WalkDir(InDir)``; they are
        processed in increasing order of the integer value of their base
        name.  Returns ``(lGraph, lLabel)``, two parallel lists.
        """
        sPrefix = set(
            ['_'.join(name.split('_')[:-1]) for name in WalkDir(InDir)])
        lInName = sorted(sPrefix, key=lambda item: int(ntpath.basename(item)))

        lGraph, lLabel = [], []
        for InName in lInName:
            GraphTensor, QRelVec = GraphDataPreparationcC.LoadOneQuery(InName)
            lGraph.append(GraphTensor)
            lLabel.append(QRelVec)
            logging.info('[%s] data loaded', ntpath.basename(InName))

        logging.info('add graph data and label loaded [%d] query', len(lGraph))
        return lGraph, lLabel

    def Process(self):
        """Run ProcessOneQuery on every file returned by ``WalkDir(self.InDir)``."""
        for QueryFile in WalkDir(self.InDir):
            self.ProcessOneQuery(QueryFile)

        logging.info('finished, data in [%s]', self.OutDir)