class SearchResDocGraphConstructorC(DocGraphConstructorC):
    """Build and dump a document knowledge graph for every document retrieved
    for each query in an input query file.

    Output layout: OutDir/<qid>/<DocNo>, one dumped graph per retrieved doc.
    """

    def Init(self):
        DocGraphConstructorC.Init(self)
        # retrieval backend used to fetch the per-query document list
        self.Searcher = IndriSearchCenterC()

    def SetConf(self, ConfIn):
        DocGraphConstructorC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        DocGraphConstructorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """Retrieve docs for one query, build one graph per doc, dump each
        graph to OutDir/qid/DocNo. Returns True on completion."""
        lDoc = self.Searcher.RunQuery(query, qid)
        # GraphFormer / OutDir come from the DocGraphConstructorC base
        lDocKg = [self.GraphFormer.FillDocGraph(doc.DocNo) for doc in lDoc]
        QueryOutDir = self.OutDir + '/' + qid
        if not os.path.exists(QueryOutDir):
            os.makedirs(QueryOutDir)
        for DocKg in lDocKg:
            DocKg.dump(QueryOutDir + '/' + DocKg.DocNo)
            # fixed typo in log message ('dummped' -> 'dumped')
            logging.debug('[%s] dumped [%d] node', DocKg.DocNo, len(DocKg))
        logging.info('[%s-%s] doc kg formed', qid, query)
        return True

    def Process(self, QInName):
        """Run FormForOneQ for every qid<TAB>query line in QInName."""
        QFile = open(QInName)
        try:
            lQidQuery = [line.split('\t')
                         for line in QFile.read().splitlines()]
        finally:
            # close the handle explicitly instead of relying on refcount GC
            QFile.close()
        for qid, query in lQidQuery:
            self.FormForOneQ(qid, query)
        logging.info('[%s] query finished', QInName)
        return True
class LESRanker(cxBaseC): def Init(self): cxBaseC.Init(self) self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.Evaluator = AdhocEvaC() self.Inferener = LESInferencerC() self.QDocNodeDataDir = "" self.OrigQWeight = 0.5 self.UseQObjOnly = True @classmethod def ShowConf(cls): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() AdhocEvaC.ShowConf() print 'qdocnodedatadir\norigqweight 0.5\nqobjonly 1' def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.Evaluator.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/' self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight) self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1)) def LoadQDocObj(self, query): InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName( query) hQDocObj = {} for line in open(InName): key, ObjId = line.strip().split('\t') if not key in hQDocObj: hQDocObj[key] = [ObjId] else: hQDocObj[key].append(ObjId) logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj)) return hQDocObj def RankingForOneQ(self, qid, query): logging.info('Start LES ranking for [%s-%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) logging.info('doc fetched') hQDocObj = self.LoadQDocObj(query) QKey = 'q_%s' % (qid) if not QKey in hQDocObj: #do nothing logging.info('query [%s] has no object, return raw raning', qid) return [doc.DocNo for doc in lDoc] lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]] lDocLESScore = [] LesCnt = 0 for doc in lDoc: if self.UseQObjOnly: lDocObj = lQObj else: if not doc.DocNo in hQDocObj: lDocLESScore.append(0) continue lDocObj = [ self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[doc.DocNo] ] score = self.Inferener.inference(query, doc, lQObj, lDocObj) if score != 0: #if 0, means the obj has no desp (or very short one), doesn't count as valid score LesCnt += 1 
lDocLESScore.append(score) #add average score to doc without annotation #using zero is not very proper AvgScore = sum(lDocLESScore) / float(LesCnt) lDocLESScore = [ item if item != 0 else AvgScore for item in lDocLESScore ] lScore= [self.OrigQWeight * math.exp(doc.score) + (1-self.OrigQWeight) * LESScore \ for doc,LESScore in zip(lDoc,lDocLESScore)] lDocNoScore = zip([doc.DocNo for doc in lDoc], lScore) lDocNoScore.sort(key=lambda item: item[1], reverse=True) lRankedDocNo = [item[0] for item in lDocNoScore] logging.info('query [%s] ranked', qid) return lRankedDocNo def Process(self, QIn, OutName): lQidQuery = [ line.split('\t') for line in open(QIn).read().splitlines() ] llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery] logging.info('start evaluation') lQid = [item[0] for item in lQidQuery] lQuery = [item[1] for item in lQidQuery] lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo) out = open(OutName, 'w') for qid, EvaRes in lPerQEvaRes: print >> out, qid + '\t' + EvaRes.dumps() out.close() logging.info('%s %s', lPerQEvaRes[-1][0], lPerQEvaRes[-1][1].dumps()) return True
class LeToRFeatureExtractCenterC(cxBaseC):
    """Learning-to-rank feature extraction hub.

    Dispatches to the feature extractors enabled in the 'featuregroup' conf
    ('givenfeature', 'termpairemb', 'emblm') and can run the full pipeline
    (retrieve docs per query, extract features, dump svmlight-style lines).
    """

    def Init(self):
        cxBaseC.Init(self)
        self.Prepared = False            # guards one-time Prepare()
        self.Word2VecInName = ""         # path of the word2vec model file
        self.Word2VecModel = None        # loaded lazily in Prepare()
        self.lFeatureGroup = []          # enabled feature-group names
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()   # relevance judgments (labels)
        self.QRelIn = ""

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.lFeatureGroup = self.conf.GetConf('featuregroup')
        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)
        # conf may return a bare string when only one group is configured
        if type(self.lFeatureGroup) != list:
            self.lFeatureGroup = [self.lFeatureGroup]
        self.Searcher.SetConf(ConfIn)
        # only configure the extractors that are actually enabled
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.SetConf(ConfIn)
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.SetConf(ConfIn)
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.SetConf(ConfIn)
        return True

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        print 'word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm'
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def Prepare(self):
        """One-time setup: load the word2vec model and prepare the enabled
        extractors. Safe to call repeatedly (no-op once Prepared)."""
        if self.Prepared:
            return
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(self.Word2VecInName)
        logging.info('word2vec loaded')
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.Prepare()
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.Prepare()
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.Prepare()
        self.Prepared = True
        return

    def Process(self, qid,query,doc):
        '''
        Extract all enabled feature groups for one query-document pair.
        Returns a {feature name: value} dict (union of all groups).
        '''
        self.Prepare()
        hFeature = {}
        logging.debug('extracting for [%s][%s]',qid,doc.DocNo)
        if 'givenfeature' in self.lFeatureGroup:
            hFeature.update(self.GivenFeatureExtractor.Extract(qid, query, doc))
            logging.debug('given feature extracted')
        if 'termpairemb' in self.lFeatureGroup:
            hFeature.update(self.EmbTermPairFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('termpairemb feature extracted')
        if 'emblm' in self.lFeatureGroup:
            hFeature.update(self.EmbLmFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('emblm feature extracted')
        return hFeature

    def PipeLineRun(self,QInName,OutName):
        '''
        Full pipeline: for each qid<TAB>query in QInName, retrieve docs,
        extract features, label with qrel scores, and dump one LeToR data
        line per doc to OutName. The accumulated feature-name -> id mapping
        is written to OutName + '_FeatureName'.
        '''
        hFeatureName = {}   # feature name -> integer id, grown as we go
        self.Prepare()
        lLines = open(QInName).read().splitlines()
        lQidQuery = [line.split('\t') for line in lLines]
        out = open(OutName,'w')
        logging.info('start extracting for file [%s]',QInName)
        for qid,query in lQidQuery:
            lDoc = self.Searcher.RunQuery(query, qid)
            for doc in lDoc:
                hFeature = self.Process(qid, query, doc)
                LTRData = LeToRDataBaseC()
                LTRData.qid = qid
                LTRData.DocNo = doc.DocNo
                LTRData.hFeature = hFeature
                # relevance label from the qrel file
                LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)
                # register any new feature names into the shared hash
                hFeatureName = LTRData.HashFeatureName(hFeatureName)
                print >>out,LTRData.dumps()
            logging.info('qid [%s] extracted',qid)
        out.close()
        NameOut = open(OutName + '_FeatureName','w')
        for name,Id in hFeatureName.items():
            print >>NameOut,'%d\t%s' %(Id,name)
        NameOut.close()
        logging.info('finished')
        return
class DocAnaResSERPSplitterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.Searcher = IndriSearchCenterC() self.hDocAnaData = {} self.hDocText = {} self.OutDir = '' self.QInName = "" def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) DocAnaIn = self.conf.GetConf('docanain') DocTextIn = self.conf.GetConf('doctextin') self.ReadDocAna(DocAnaIn, DocTextIn) self.OutDir = self.conf.GetConf('outdir') self.QInName = self.conf.GetConf('in') @staticmethod def ShowConf(): cxBaseC.ShowConf() print 'docanain\noutdir\nin\ndoctextin' IndriSearchCenterC.ShowConf() def ReadDocAna(self, DocAnaIn, DocTextIn): lLines = open(DocAnaIn).read().splitlines() lDict = [[line.split()[0], line] for line in lLines] self.hDocAnaData = dict(lDict) lLines = open(DocTextIn).read().splitlines() lDict = [line.split('#')[0].strip().split('\t') for line in lLines] self.hDocText = dict(lDict) return True def DumpOneQ(self, qid, query): lDoc = self.Searcher.RunQuery(query, qid) out = open(self.OutDir + '/%s' % (query.replace(' ', '_')), 'w') for doc in lDoc: if (not doc.DocNo in self.hDocAnaData) | (not doc.DocNo in self.hDocText): continue print >> out, "<doc>" line = self.hDocAnaData[doc.DocNo] vCol = line.split('\t') text = self.hDocText[doc.DocNo] print >> out, vCol[0] + '\t' + text if len(vCol) > 2: vAna = vCol[1:] for i in range(len(vAna) / 8): print >> out, '\t'.join(vAna[8 * i:8 * i + 8]) print >> out, "</doc>\n\n\n" out.close() logging.info('[%s] data dumped', query) return True def Process(self): lQidQuery = [ line.split('\t') for line in open(self.QInName).read().splitlines() ] for qid, query in lQidQuery: self.DumpOneQ(qid, query) logging.info('finished')
class NodeCollectorCenterC(cxBaseC):
    """Collect graph nodes (Freebase object ids) for queries and their
    retrieved documents, and dump/load them in a raw per-query format
    where each line is '<DocNo or q_qid>\\t<ObjId>'."""

    def Init(self):
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
        self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()
        self.lQueryNodeGroup = []   # enabled query-node sources ('ana')
        self.lDocNodeGroup = []     # enabled doc-node sources ('facc')

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.lQueryNodeGroup = self.conf.GetConf('querynodegroup',
                                                 self.lQueryNodeGroup)
        self.lDocNodeGroup = self.conf.GetConf('docnodegroup',
                                               self.lDocNodeGroup)
        self.Searcher.SetConf(ConfIn)
        # only configure the collectors that are enabled
        if 'ana' in self.lQueryNodeGroup:
            self.QueryNodePreFetchedCollector.SetConf(ConfIn)
        if 'facc' in self.lDocNodeGroup:
            self.DocNodeFaccAnaCollector.SetConf(ConfIn)
        logging.info('node collector center conf set')
        return

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        QueryPreFetchedNodeCollectorC.ShowConf()
        DocNodeFaccAnaCollectorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        print 'querynodegroup ana'
        print 'docnodegroup facc'

    def process(self, qid, query):
        '''
        Retrieve lDoc, then collect query nodes and per-doc nodes.
        Returns (lDoc, lQObj, llDocObj) where llDocObj is aligned with lDoc.
        '''
        lDoc = self.Searcher.RunQuery(query, qid)
        lQObj = self.CollectQueryNode(qid, query)
        llDocObj = self.CollectDocNode(lDoc, qid, query)
        logging.info('[%s][%s] node collected', qid, query)
        return lDoc, lQObj, llDocObj

    def CollectQueryNode(self, qid, query):
        """Return the deduplicated list of object ids annotated on the
        query (order not preserved due to the set)."""
        lQNodeScore = []
        if 'ana' in self.lQueryNodeGroup:
            lQNodeScore.extend(
                self.QueryNodePreFetchedCollector.process(qid, query))
        lQObj = list(set([item[0] for item in lQNodeScore]))
        return lQObj

    def CollectDocNode(self, lDoc, qid, query):
        """Return one deduplicated object-id list per doc ([] when the
        'facc' source is disabled)."""
        llDocObj = []
        if 'facc' in self.lDocNodeGroup:
            llDocNodeScore = self.DocNodeFaccAnaCollector.process(
                lDoc, qid, query)
            llDocObj = [
                list(set([item[0] for item in lDocNodeScore]))
                for lDocNodeScore in llDocNodeScore
            ]
        return llDocObj

    def PipeRun(self, QInName, OutName, OutFormat='json'):
        '''
        Read qid<TAB>query lines from QInName and run node collection.
        OutFormat 'json': one json [qid,query,lDoc,lQObj,lDocObj] line per
        query into file OutName. OutFormat 'dir': one raw-format file per
        query under directory OutName (see DumpRawFormat).
        '''
        lQidQuery = [
            line.split('\t')
            for line in open(QInName).read().splitlines()
        ]
        if OutFormat == 'json':
            out = open(OutName, 'w')
        for qid, query in lQidQuery:
            lDoc, lQObj, llDocObj = self.process(qid, query)
            if OutFormat == 'json':
                print >> out, json.dumps([qid, query, lDoc, lQObj, llDocObj])
            if OutFormat == 'dir':
                # print doc id \t obj id (doc id could be a q_<qid> key
                # indicating a query obj)
                self.DumpRawFormat(qid, query, lDoc, lQObj, llDocObj,
                                   OutName)
        if OutFormat == 'json':
            out.close()
        logging.info('query in [%s] node genereated, dumped to [%s]',
                     QInName, OutName)

    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        """Write one raw-format node file for this query: query objects as
        'q_<qid>\\tObjId' lines first, then 'DocNo\\tObjId' lines grouped
        by doc (the grouping is what LoadRawFormatNodeRes relies on)."""
        if not os.path.exists(OutName):
            os.makedirs(OutName)
        out = open(
            OutName + '/' +
            IndriSearchCenterC.GenerateQueryTargetName(query), 'w')
        logging.info('q[%s] has [%d] q node', qid, len(lQObj))
        for QObj in lQObj:
            print >> out, 'q_' + qid + '\t' + QObj
        if llDocObj == []:
            logging.info('no doc node')
        else:
            for doc, lDocObj in zip(lDoc, llDocObj):
                logging.info('doc [%s] has [%d] node', doc.DocNo,
                             len(lDocObj))
                for DocObj in lDocObj:
                    print >> out, doc.DocNo + '\t' + DocObj
        out.close()
        logging.info('q [%s] raw node res dumpped', qid)
        return

    @staticmethod
    def LoadRawFormatNodeRes(query, InDir):
        '''
        Read back a raw-format node file as dumped by DumpRawFormat.
        Returns (lDocNo, lQObj, llDocObj); llDocObj is aligned with lDocNo.
        Relies on doc lines being contiguous per DocNo (as dumped).
        '''
        lDocNo = []
        llDocObj = []
        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        lLines = open(InName).read().splitlines()
        lvCol = [line.split('\t') for line in lLines]
        # 'q_' prefix separates query-object rows from doc-object rows
        lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
        lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]
        lQObj = [vCol[1] for vCol in lQCol]
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))
        LastDocNo = ""
        for DocNo, ObjId in lDocCol:
            # start a new per-doc bucket whenever the DocNo changes
            if not DocNo == LastDocNo:
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)
        return lDocNo, lQObj, llDocObj
class ContinuousLmRankingEvaluatorC(cxBaseC): def Init(self): cxBaseC.Init(self) self.Evaluator = AdhocEvaC() self.Searcher = IndriSearchCenterC() self.Word2VecInName = "" self.Word2VecModel = None self.lLmName = [] self.LmClass = None self.lOutName = [] self.QueryInName = "" def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.Evaluator.SetConf(ConfIn) self.lLmName = self.conf.GetConf('lmname', self.lLmName) self.QueryInName = self.conf.GetConf('in') self.lOutName = self.conf.GetConf('out', self.lOutName) self.Word2VecInName = self.conf.GetConf('word2vecin', self.Word2VecInName) self.LoadWord2Vec() def LoadWord2Vec(self): logging.info('start load word2vec input') self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format( self.Word2VecInName) logging.info('word2vec loaded') @classmethod def ShowConf(cls): cxBaseC.ShowConf() print cls.__name__ print 'word2vecin\nkernel\nlmname\nbandwidth\nin\nout' IndriSearchCenterC.ShowConf() AdhocEvaC.ShowConf() def ReRankAndEvaPerQ(self, qid, query, lDoc, lLm): lReRankDocNo, lScore = self.FormNewRank(query, lDoc, lLm) EvaRes = self.Evaluator.EvaluatePerQ(qid, query, lReRankDocNo) logging.info('[%s][%s] result [%s]', qid, query, EvaRes.dumps()) return EvaRes, lReRankDocNo, lScore def FormNewRank(self, query, lDoc, lLm): lQTerm = query.split() if [] == lQTerm: return self.MinLogPdf lQX = [ self.Word2VecModel[term] for term in lQTerm if term in self.Word2VecModel ] lScore = [lm.InferenceQVec(lQX) for lm in lLm] lDocScore = zip(lDoc, lScore) lDocScore.sort(key=lambda item: item[1], reverse=True) lDocNo = [item[0].DocNo for item in lDocScore] lScore = [item[1] for item in lDocScore] return lDocNo, lScore def FormLm(self, doc): lTerm = doc.GetContent().split() Lm = self.LmClass() Lm.SetPara(self.conf) Lm.Construct(lTerm, self.Word2VecModel) return Lm def FormPerQData(self, qid, query): lDoc = self.Searcher.RunQuery(query, qid) lLm = [self.FormLm(doc) for doc in lDoc] return lDoc, lLm def 
SetLmClass(self, cLmName): ''' select proper class name for cLmName ''' if cLmName == 'gaussian': logging.info('use gaussian clm') self.LmClass = GaussianLmC return True if cLmName == 'kde': logging.info('use kde lm') self.LmClass = KernelDensityLmC return True if cLmName == 'sum': logging.info('use raw sum') self.LmClass = SummationLmC return True if cLmName == 'rand': logging.info('use rand') self.LmClass = RandLmC return True if cLmName == 'radius': logging.info('use radius') self.LmClass = RadiusMatchLmC return True raise NotImplementedError( 'please choose continuous language model from gaussian|kde') def Process(self): for OutName, cLmName in zip(self.lOutName, self.lLmName): self.RunForOneLm(self.QueryInName, OutName, cLmName) def RunForOneLm(self, QueryInName, OutName, cLmName): ''' evaluate cLmName on QueryInName's queries evaluation result output to OutName ''' lQidQuery = [ line.split('\t') for line in open(QueryInName).read().splitlines() ] self.SetLmClass(cLmName) lEvaRes = [] RankOut = open(OutName + '_rank', 'w') logging.info('start evaluating...') for qid, query in lQidQuery: lDoc, lLm = self.FormPerQData(qid, query) EvaRes, lDocNo, lScore = self.ReRankAndEvaPerQ( qid, query, lDoc, lLm) lEvaRes.append(EvaRes) for i in range(len(lDocNo)): print >> RankOut, qid + ' Q0 ' + lDocNo[i] + ' %d %f %s' % ( i + 1, lScore[i], cLmName) RankOut.close() lEvaRes.append(AdhocMeasureC.AdhocMeasureMean(lEvaRes)) lQid = [item[0] for item in lQidQuery] + ['mean'] out = open(OutName, 'w') for qid, EvaRes in zip(lQid, lEvaRes): print >> out, qid + '\t' + EvaRes.dumps() out.close() logging.info('evaluation res %s', lEvaRes[-1].dumps()) return True
class GraphFullFeatureExtractCenterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.NodeDir = "" self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.QDocFeatureExtractor = LeToRFeatureExtractCenterC() self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC() self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC() self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC() def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.NodeDir = self.conf.GetConf('nodedir') + '/' self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocFeatureExtractor.SetConf(ConfIn) self.QObjFeatureExtractor.SetConf(ConfIn) self.DocObjFeatureExtractor.SetConf(ConfIn) self.ObjObjFeatureExtractor.SetConf(ConfIn) logging.info('graph full feature extractor conf setted') @classmethod def ShowConf(cls): cxBaseC.ShowConf() print cls.__name__ print 'nodedir' IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() LeToRFeatureExtractCenterC.ShowConf() FbQObjFeatureExtractCenterC.ShowConf() FbObjDocFeatureExtractCenterC.ShowConf() ObjObjFeatureExtractCenterC.ShowConf() def FormulateNodes(self, qid, query): ''' get ldoc and read lObjId fill lObjId ''' logging.info('formulating node for q [%s][%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes( query, self.NodeDir) #match lDoc dim lDocNo dim lDoc = IndriSearchCenterC.RearrangeDocOrder(lDoc, lDocNo) lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId] llDocObj = [[self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId] for lDocObjId in llDocObjId] while len(llDocObj) < len(lDoc): #add empty list for docs have no objects (thus will restrict to EsdRank) #if lQObj is also empty, then it is LeToR llDocObj.append([]) logging.info('q[%s] all node fetched, q node %s', qid, json.dumps([Obj.GetId() for Obj in lQObj])) return lDoc, lQObj, llDocObj def Process(self, qid, query, OutDir): ''' ''' lDoc, lQObj, llDocObj = 
self.FormulateNodes(qid, query) for doc, lDocObj in zip(lDoc, llDocObj): hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature = self.ExtractFeatureForOneQDoc( qid, query, doc, lQObj + lDocObj) self.DumpPerQRes(qid, query, doc, lQObj + lDocObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir) logging.info('q [%s] processed') return True def PipeRun(self, QInName, OutDir): lQidQuery = [ line.split('\t') for line in open(QInName).read().splitlines() ] for qid, query in lQidQuery: self.Process(qid, query, OutDir) logging.info('queries in [%s] processed features at [%s]', QInName, OutDir) return True def ExtractFeatureForOneQDoc(self, qid, query, doc, lObj): #if wanna speed up, cache features #for clearity, now just extract multiple times hQDocFeature = self.QDocFeatureExtractor.Process(qid, query, doc) logging.debug('q[%s][%s] ltr feature extracted', query, doc.DocNo) lhQObjFeature = self.QObjFeatureExtractor.ProcessOneQuery([qid, query], lObj) logging.debug('q[%s][%s] obj feature extracted', query, doc.DocNo) lhDocObjFeature = self.DocObjFeatureExtractor.ProcessOneQueryDocPair( [qid, query], doc, lObj) logging.debug('q[%s][%s] doc obj feature extracted', query, doc.DocNo) llhObjObjFeature = self.ObjObjFeatureExtractor.Process( qid, query, lObj) #symetric matrix logging.debug('q[%s] [%s] obj obj feature extracted', query, doc.DocNo) logging.debug('q [%s][%s] all doc graph feature extracted', query, doc.DocNo) return hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature def DumpPerQRes(self, qid, query, doc, lObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir): ''' raw: a dir for this q a file for each doc node a, node b, hFeature.json ''' if not os.path.exists(OutDir + '/' + qid): os.makedirs(OutDir + '/' + qid) OutName = OutDir + '/' + qid + '/' + doc.DocNo out = open(OutName, 'w') #q doc print >> out, 'q_%s' % (qid) + '\t' + doc.DocNo + '\t' + json.dumps( hQDocFeature) #obj doc for Obj, hDocObjFeature 
in zip(lObj, lhDocObjFeature): print >> out, Obj.GetId() + '\t' + doc.DocNo + '\t' + json.dumps( hDocObjFeature) #q obj for Obj, hQObjFeature in zip(lObj, lhQObjFeature): print >> out, 'q_%s' % ( qid) + '\t' + Obj.GetId() + '\t' + json.dumps(hQObjFeature) print >> out, Obj.GetId() + '\t' + 'q_%s' % ( qid) + '\t' + json.dumps(hQObjFeature) #make it symmetric #obj obj for i in range(len(lObj)): for j in range(len(lObj)): if i == j: continue print >> out, lObj[i].GetId() + '\t' + lObj[j].GetId( ) + '\t' + json.dumps(llhObjObjFeature[i][j]) logging.info('q[%s] doc [%s] graph dumped to file [%s]', qid, doc.DocNo, OutName) return True
class EdgeFeatureExtractCenterC(cxBaseC):
    """Extract edge features between queries, documents and objects for
    one query at a time, dumping one file per query where each line is
    '<query|DocNo|ObjId>\\t<ObjId>\\t<json feature dict>'."""

    def Init(self):
        cxBaseC.Init(self)
        self.lQObjFeatureGroup = []     # enabled q-obj sources ('ana')
        self.lObjObjFeatureGroup = []   # 'kg' | 'precalc' | 'textsim'
        self.lDocObjFeatureGroup = []   # enabled doc-obj sources ('facc')
        self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
        self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
        self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
        self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
        self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NodeDir = ""   # dir of per-query object-id files

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.NodeDir = self.conf.GetConf('nodedir') + '/'
        self.lQObjFeatureGroup = self.conf.GetConf('qobjfeaturegroup',
                                                   self.lQObjFeatureGroup)
        self.lDocObjFeatureGroup = self.conf.GetConf(
            'docobjfeaturegroup', self.lDocObjFeatureGroup)
        self.lObjObjFeatureGroup = self.conf.GetConf(
            'objobjfeaturegroup', self.lObjObjFeatureGroup)
        # only configure the extractors that are enabled
        if 'ana' in self.lQObjFeatureGroup:
            self.QObjAnaExtractor.SetConf(ConfIn)
        if 'facc' in self.lDocObjFeatureGroup:
            self.DocObjFaccExtractor.SetConf(ConfIn)
        if 'kg' in self.lObjObjFeatureGroup:
            self.ObjObjKGExtractor.SetConf(ConfIn)
        if 'precalc' in self.lObjObjFeatureGroup:
            self.ObjObjPreCalcExtractor.SetConf(ConfIn)
        if 'textsim' in self.lObjObjFeatureGroup:
            self.ObjObjTextSimExtractor.SetConf(ConfIn)
        logging.info('edge feature center confs setted')

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print 'nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup'
        QueryObjEdgeFeatureAnaExtractorC.ShowConf()
        DocObjEdgeFeatureFaccExtractorC.ShowConf()
        ObjObjEdgeFeatureKGExtractorC.ShowConf()
        ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf()
        ObjObjEdgeFeatureTextSimExtractorC.ShowConf()

    def FormulateNodes(self, qid, query):
        '''
        Retrieve lDoc and read this query's object-id list from NodeDir
        (one ObjId per line), then fetch the objects.
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)
        lObjId = open(self.NodeDir +
                      IndriSearchCenterC.GenerateQueryTargetName(query)).read(
                      ).splitlines()
        lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
        logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
        return lDoc, lObj

    def ExtractPerQObj(self, qid, query, obj):
        """Edge features for one (query, obj) pair (empty dict when no
        group is enabled)."""
        hFeature = {}
        logging.debug('start extracting q[%s]-obj[%s] feature', query,
                      obj.GetId())
        if 'ana' in self.lQObjFeatureGroup:
            hFeature.update(self.QObjAnaExtractor.process(qid, query, obj))
        logging.debug('q[%s]-obj[%s] feature extracted', query, obj.GetId())
        return hFeature

    def ExtractQObjFeature(self, qid, query, lObj):
        """One feature dict per object, aligned with lObj."""
        lhFeature = []
        logging.info('start extracting [%s][%s] q-obj feature [%d] obj',
                     qid, query, len(lObj))
        for obj in lObj:
            hFeature = self.ExtractPerQObj(qid, query, obj)
            lhFeature.append(hFeature)
        logging.info('q obj feature extracted')
        return lhFeature

    def ExtractPerDocObj(self, doc, obj):
        """Edge features for one (doc, obj) pair."""
        hFeature = {}
        logging.debug('start extracting doc[%s]-obj[%s] feature', doc.DocNo,
                      obj.GetId())
        if 'facc' in self.lDocObjFeatureGroup:
            hFeature.update(self.DocObjFaccExtractor.process(doc, obj))
        logging.debug('doc[%s]-obj[%s] feature extracted', doc.DocNo,
                      obj.GetId())
        return hFeature

    def ExtractDocObjFeature(self, lDoc, lObj):
        """Feature matrix over doc x obj, aligned with lDoc and lObj."""
        llhFeature = []  # doc \times obj
        logging.info('start extract [%d] doc - [%d] obj feature mtx',
                     len(lDoc), len(lObj))
        for doc in lDoc:
            lhFeature = []
            for obj in lObj:
                hFeature = self.ExtractPerDocObj(doc, obj)
                lhFeature.append(hFeature)
            llhFeature.append(lhFeature)
        logging.info('doc obj feature extracted')
        return llhFeature

    def ExtractPerObjObj(self, ObjA, ObjB, query):
        """Edge features for one ordered (ObjA, ObjB) pair."""
        hFeature = {}
        logging.debug('start extracting for obj pair [%s-%s]', ObjA.GetId(),
                      ObjB.GetId())
        if 'kg' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjKGExtractor.process(ObjA, ObjB))
        if 'precalc' in self.lObjObjFeatureGroup:
            hFeature.update(
                self.ObjObjPreCalcExtractor.process(ObjA, ObjB, query))
        if 'textsim' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjTextSimExtractor.process(ObjA, ObjB))
        logging.debug('obj pair [%s-%s] feature extracted', ObjA.GetId(),
                      ObjB.GetId())
        return hFeature

    def ExtractObjObjFeature(self, lObj, query):
        """Feature matrix over obj pairs; the diagonal (ObjA == ObjB) is
        skipped, so each row has len(lObj) - 1 entries.

        NOTE(review): because of the skip, row entries after the diagonal
        are shifted by one relative to lObj; DumpRes zips rows directly
        against lObj, which looks misaligned -- confirm intended pairing.
        """
        llhFeature = []  # obj -> obj, diagonal is empty
        logging.info('start extract [%d] obj pair feature mtx', len(lObj))
        for ObjA in lObj:
            lhFeature = []
            for ObjB in lObj:
                if ObjA.GetId() == ObjB.GetId():
                    continue
                hFeature = self.ExtractPerObjObj(ObjA, ObjB, query)
                lhFeature.append(hFeature)
            llhFeature.append(lhFeature)
        logging.info('obj obj feature extracted')
        return llhFeature

    def Process(self, qid, query):
        """Extract all edge-feature groups for one query."""
        lDoc, lObj = self.FormulateNodes(qid, query)
        logging.info('nodes fetched')
        lQObjFeature = self.ExtractQObjFeature(qid, query, lObj)
        llDocObjFeature = self.ExtractDocObjFeature(lDoc, lObj)
        llObjObjFeature = self.ExtractObjObjFeature(lObj, query)
        return lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature

    def DumpRes(self, OutName, query, lDoc, lObj, lQObjFeature,
                llDocObjFeature, llObjObjFeature):
        """Dump all edges of one query to OutName, one json line per edge."""
        out = open(OutName, 'w')
        # query - obj edges
        for obj, hFeature in zip(lObj, lQObjFeature):
            print >> out, query + '\t' + obj.GetId() + '\t' + json.dumps(
                hFeature)
        # doc - obj edges
        for doc, lhFeature in zip(lDoc, llDocObjFeature):
            for obj, hFeature in zip(lObj, lhFeature):
                print >> out, doc.DocNo + '\t' + obj.GetId(
                ) + '\t' + json.dumps(hFeature)
        # obj - obj edges (see NOTE(review) in ExtractObjObjFeature about
        # row alignment: rows have len(lObj) - 1 entries, zip truncates)
        for ObjA, lhFeature in zip(lObj, llObjObjFeature):
            for ObjB, hFeature in zip(lObj, lhFeature):
                print >> out, ObjA.GetId() + '\t' + ObjB.GetId(
                ) + '\t' + json.dumps(hFeature)
        out.close()
        logging.info('query [%s] feature dumped', query)

    def PipeRun(self, QInName, OutDir):
        '''
        For now: output raw type; each file is a query's edge features,
        each line is query|doc|obj \t obj \t json.dumps(hFeature).
        '''
        lQidQuery = [
            line.split('\t')
            for line in open(QInName).read().splitlines()
        ]
        for qid, query in lQidQuery:
            logging.info('start extracting for [%s][%s]', qid, query)
            lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = \
                self.Process(qid, query)
            OutName = OutDir + '/' + \
                IndriSearchCenterC.GenerateQueryTargetName(query)
            logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                         OutName)
            self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                         llDocObjFeature, llObjObjFeature)
        logging.info('all finished')
        return
class EntityCorrelationFromTextSimC(cxBaseC):
    """Find the text-similarity neighbors of Freebase entities: search the
    index with the entity's name and rank candidate entities by (negated)
    KL divergence between description language models."""

    def Init(self):
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NeighborNum = 50   # top-k neighbors kept per entity

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.NeighborNum = self.conf.GetConf('neighbornum', self.NeighborNum)

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print 'neighbornum'

    def ProcessOneObj(self, ObjId, name):
        '''
        Return lObjNeighbor = [[ObjId, -KL score], ...] sorted best-first
        (scores are negated KL, so larger means more similar).
        Returns [] when the cleaned name is empty or the entity has no
        usable description.
        '''
        # search in index with the entity name, get candidate entity docs
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []
        lObjDoc = self.Searcher.RunQuery(query)
        lObjNeighbor = []
        # language model of this entity's own description
        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        ThisVec = VectorC(ThisLm.hTermTF)
        if len(ThisLm.hTermTF) == 0:
            return []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                # skip the entity itself
                continue
            if not Id.startswith('/m/'):
                # candidate doc ids are expected to be Freebase mids
                print "[%s %s] neighbor id [%s] format error" % (ObjId,
                                                                 name, Id)
                continue
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            # store negated KL so that sorting descending ranks the most
            # similar (smallest-divergence) neighbor first
            score = VectorC.KL(ThisVec, NeighborVec)
            lObjNeighbor.append([Id, -score])
        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print "[%s:%s] neighbor id score get" % (ObjId, name)
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        """For each 'ObjId<TAB>name' line in ObjInName, dump the top
        NeighborNum neighbors as 'ObjId\\tNeighborId\\tscore\\tname\\t
        neighbor name' lines to OutName."""
        out = open(OutName, 'w')
        for line in open(ObjInName):
            vCol = line.strip().split('\t')
            if len(vCol) < 2:
                # need both ObjId and name
                continue
            lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1])
            for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                print >> out, '%s\t%s\t%f\t%s\t%s' % (
                    vCol[0], NeighborId, score, vCol[1],
                    self.ObjCenter.FetchObjName(NeighborId))
            print "[%s:%s] done" % (vCol[0], vCol[1])
        out.close()
        print "finished"
class SearchResultWordVecAnalysiserC(cxBaseC):
    """Analyse the word-vector distribution of search-result documents:
    pool the embeddings of all terms in all retrieved docs and compute
    per-dimension histograms, a covariance matrix, and Pearson
    correlations between dimensions."""

    def Init(self):
        cxBaseC.Init(self)
        self.QIn = ""             # qid<TAB>query input file
        self.OutDir = ""          # where analysis pickles are written
        self.Word2VecInName = ""
        self.Word2VecModel = None
        self.Searcher = IndriSearchCenterC()
        self.BinNumber = 100      # histogram bins per dimension

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.LoadWord2Vec()
        self.QIn = self.conf.GetConf('in')
        self.OutDir = self.conf.GetConf('outdir')
        self.BinNumber = self.conf.GetConf('binnumber', self.BinNumber)

    def LoadWord2Vec(self):
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(
            self.Word2VecInName)
        logging.info('word2vec loaded')

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        print 'word2vecin\nin\noutdir\nbinnumber'
        IndriSearchCenterC.ShowConf()

    def LoadDocWordVec(self):
        """Return an (n_terms, dim) array of the embeddings of every
        in-vocabulary term occurrence in every retrieved doc of every
        query in QIn."""
        lDoc = []
        lQidQuery = [
            line.split('\t')
            for line in open(self.QIn).read().splitlines()
        ]
        for qid, query in lQidQuery:
            lDoc.extend(self.Searcher.RunQuery(query, qid))
        lTerm = []
        for doc in lDoc:
            lTerm.extend(doc.GetContent().split())
        # out-of-vocabulary terms are silently dropped
        lX = np.array([
            self.Word2VecModel[term] for term in lTerm
            if term in self.Word2VecModel
        ])
        logging.info('target doc word vec get')
        return lX

    def BinData(self, lX, OutName):
        '''
        Bin each dimension of lX; pickles a list of
        [mu, sigma, hist, bins] per dimension (sigma is the variance,
        from np.var).
        '''
        logging.info('binning data')
        lBinData = []
        dim = lX.shape[1]
        for i in range(dim):
            x = lX[:, i]
            logging.info('binning dim [%d]', i)
            mu = np.mean(x)
            sigma = np.var(x)
            hist, bins = np.histogram(x, bins=self.BinNumber)
            lBinData.append([mu, sigma, hist, bins])
        out = open(OutName, 'w')
        pickle.dump(lBinData, out)
        out.close()
        logging.info('data binned to [%s]', OutName)
        return

    def CalcPersonCorrelation(self, lX, OutName):
        """Pairwise Pearson correlation between dimensions; pickles the
        symmetric correlation matrix to OutName + '_pearson' and the
        p-value matrix to OutName + '_pvalue'."""
        n, d = lX.shape
        mPValue = np.zeros([d, d])
        mPearson = np.zeros([d, d])
        for i in range(d):
            for j in range(i + 1, d):
                per, p = pearsonr(lX[:, i], lX[:, j])
                # fill both triangles to keep the matrices symmetric
                mPValue[i, j] = p
                mPValue[j, i] = p
                mPearson[i, j] = per
                mPearson[j, i] = per
                if p < 0.05:
                    logging.info('[%d-%d] correlated p=%f', i, j, p)
        out = open(OutName + '_pearson', 'w')
        pickle.dump(mPearson, out)
        out.close()
        out = open(OutName + '_pvalue', 'w')
        pickle.dump(mPValue, out)
        out.close()
        logging.info('pearson corr calculated and dumped')
        return True

    def CalcCovarianceMtx(self, lX, OutName):
        """Covariance matrix of lX's dimensions, computed pairwise with
        2x2 np.cov blocks (a single np.cov(lX.T) ran out of memory);
        pickled to OutName."""
        logging.info('start calculating covariance matrix')
        d = lX.shape[1]
        CovMtx = np.zeros([d, d])
        for i in range(d):
            for j in range(i, d):
                MiniCovMtx = np.cov(lX[:, i], lX[:, j])
                CovMtx[i, j] = MiniCovMtx[0, 1]
                # diagonal entry is the variance of dim i (re-assigned
                # each j with the same value)
                CovMtx[i, i] = MiniCovMtx[0, 0]
                CovMtx[j, i] = MiniCovMtx[1, 0]
        out = open(OutName, 'w')
        pickle.dump(CovMtx, out)
        out.close()
        logging.info('covariance dumped to [%s]', OutName)

    def Process(self):
        """Run the full analysis: load vectors, then dump covariance and
        Pearson-correlation matrices (histogram binning is disabled)."""
        lX = self.LoadDocWordVec()
        # self.BinData(lX, self.OutDir + '/MarginalDist')
        self.CalcCovarianceMtx(lX, self.OutDir + '/CovarianceMtx')
        self.CalcPersonCorrelation(lX, self.OutDir + '/PersonCorrelationMtx')
        logging.info('[%s] search result word vec analysis finished',
                     self.QIn)
        return True