def setSearcher(self, wikiIndexDir, queryIndexDir):
    """ Setting the indexes to search for terms"""
    self.wIndex, self.wsearcher = loadIndex(
        wikiIndexDir, wikiIndexDir[wikiIndexDir.rfind('/') + 1:])
    self.wtlc = loadCollector(self.wsearcher, 2000, 20)
    self.qIndex, self.qsearcher = loadIndex(
        queryIndexDir, queryIndexDir[queryIndexDir.rfind('/') + 1:])
    self.qtlc = loadCollector(self.qsearcher, 2000, 20)
    self.qqp = loadQueryParser(self.qIndex, 'session')
    self.wqp = loadQueryParser(self.wIndex, 'content')
def SearchYahooQuestionsWithWhoosh(input_file, index_folder, index_name, questions_limit, output_file):
    # Open the index.
    questions_index, questions_searcher = loadIndex(index_folder, index_name)
    # Load the collector.
    questions_collector = loadCollector(questions_searcher, questions_limit, 20)
    # Search on the question field for now.
    query_parser = loadQueryParser(questions_index, 'question_tokens')
    # Open the file to write the query and the top questions_limit questions.
    out = open(output_file, 'w')
    for line in open(input_file, 'r'):
        query = line.strip()
        query_object = query_parser.parse(unicode(query))
        try:
            questions_searcher.search_with_collector(query_object, questions_collector)
        except TimeLimit:
            print 'ERROR: Very long query as input.', query
        results = questions_collector.results()
        for document in results:
            question_list = document['question_text'].split('\t')
            for question in question_list:
                out.write(query + '\t' + question + '\n')
    out.close()
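# A minimal usage sketch for SearchYahooQuestionsWithWhoosh, assuming an input
# file with one query per line and an existing Whoosh index. Every path, the
# index name, and the result limit below are hypothetical placeholders, not
# values taken from the original project.
SearchYahooQuestionsWithWhoosh('queries.txt',              # hypothetical query file
                               'yahoo_answers_index_dir',  # hypothetical index folder
                               'yahoo_answers_index',      # hypothetical index name
                               50,                         # top-k questions per query
                               'query_question_pairs.txt') # hypothetical output file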
def __init__(self, indexName, rnker, noTasks, wordList):
    self.ranker = rnker
    # Load the index.
    self.index, self.searcher = loadIndex(indexName, indexName[indexName.rfind('/') + 1:])
    self.tlc = loadCollector(self.searcher, noTasks, 20)
    self.qp = loadQueryParser(self.index, 'task')
    self.porter = stem.porter.PorterStemmer()
    self.vocab = wordList
def __init__(self, fileName=None):
    self.vector = {}
    '''if not os.path.isdir(fileName):
        self.vector = {}
        self.loadVector(fileName)
    else:
    '''
    if fileName:
        self.vIndex, self.vsearcher = loadIndex(
            fileName, fileName[fileName.rfind('/') + 1:])
        self.vtlc = loadCollector(self.vsearcher, 1, 20)
        self.qqp = loadQueryParser(self.vIndex, 'term')
def getEntitiesWithEIndex(args):
    #oFile = open(args[3],'w')
    index, searcher = loadIndex(args[2], args[3])
    tlc = loadCollector(searcher, 50, 20)
    qp = loadQueryParser(index)
    querySet = set()
    fileName = args[1]
    for query in getQuery(fileName, 1):
        if query not in querySet:
            print query, findTextEntities(query, searcher, tlc, qp)
            querySet.add(query)
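# A minimal sketch of the argv-style list getEntitiesWithEIndex expects, based
# on the indexing above (args[1] = query file, args[2] = index folder,
# args[3] = index name). All paths are hypothetical placeholders.
getEntitiesWithEIndex(['prog', 'queries.txt', '/path/to/entity_index', 'entity_index'])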
def findMarkovStats(argv):
    i = 0
    wikiIndexDir = argv[2]
    queryIndexDir = argv[3]
    iFile = argv[1]
    wIndex, wsearcher = loadIndex(wikiIndexDir, wikiIndexDir)
    qIndex, qsearcher = loadIndex(queryIndexDir, queryIndexDir)
    wtlc = loadCollector(wsearcher, 2000, 20)
    qtlc = loadCollector(qsearcher, 2000, 20)
    qqp = loadQueryParser(qIndex, 'session')
    wqp = loadQueryParser(wIndex, 'content')
    prec = {}
    recall = {}
    count = 0.0
    for session in getSessionWithNL(iFile):
        # Get the query and normalize it.
        query = session[0].lower()
        query = re.sub(SYMB, ' ', query)
        query = re.sub('\d+', ' ', query)
        query = re.sub('\s+', ' ', query).strip()
        aTerms, bTerms = addedAndRemovedTerms(query, session)
        if aTerms:
            count += 1.0
            totalNetwork = {}
            #stemNetwork = {}
            #queryNetwork = {}
            #wikiNetwork = {}
            terms = updateNetwork(query, totalNetwork, wqp, wsearcher, wtlc, 'content', 'wiki')
            terms2 = updateNetwork(query, totalNetwork, qqp, qsearcher, qtlc, 'session', 'query')
            print len(terms), len(terms2)
            #updateStemNetwork(queryNetwork, stemNetwork, porter)
            #updateStemNetwork(wikiNetwork, stemNetwork, porter)
            updateStemNetwork(totalNetwork)
            #normalizeNetworks(queryNetwork)
            #normalizeNetworks(stemNetwork)
            #normalizeNetworks(wikiNetwork)
            # Calculate the mixtures at two stages.
            stage1 = {}
            stage2 = {}
            combineNetwork(1.0, stage1, totalNetwork, 'stem')
            combineNetwork(0.5, stage2, totalNetwork, 'query')
            combineNetwork(0.5, stage2, totalNetwork, 'wiki')
            # Convert into matrices for multiplication.
            totalDim = sorted(list(set(stage1.keys()) | set(stage2.keys())))
            dim = len(totalDim)
            if dim > 0:
                stage1Matrix = toMatrix(totalDim, stage1)
                print 'STAGE1', stage1Matrix[0], stage1Matrix.shape
                stage2Matrix = toMatrix(totalDim, stage2)
                print 'STAGE2', stage2Matrix[0], stage2Matrix.shape
                backSmooth = 1.0 / len(totalDim)
                stage3Matrix = numpy.zeros((dim, dim))
                stage3Matrix.fill(backSmooth)
                print 'STAGE3', stage3Matrix[0], stage3Matrix.shape
                alpha = 0.80
                #matrix = ['stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage3']
                matrix = ['stage1', 'stage2', 'stage2', 'stage2', 'stage3']
                totalSum = numpy.zeros((dim, dim))
                cK = numpy.ones((dim, dim))
                # Start the walk!
                for k in range(len(matrix)):
                    print k, matrix[k]
                    if matrix[k] == 'stage1':
                        cK = numpy.dot(stage1Matrix, cK)
                    elif matrix[k] == 'stage2':
                        cK = numpy.dot(stage2Matrix, cK)
                    else:
                        cK = numpy.dot(cK, stage3Matrix)
                    print 'CK', cK[0]
                    totalSum = totalSum + (math.pow(alpha, k) * cK)
                totalSum = totalSum * (1 - alpha)
                # Rank the terms.
                qList = []
                terms = query.split()  # getQueryTerms(query)
                for term in terms:
                    if term in totalDim:
                        qList.append(totalDim.index(term))
                    else:
                        print 'ERROR did not find', query, '\t', term, len(totalDim)
                termScore = {}
                for i in range(len(totalDim)):
                    termScore[totalDim[i]] = 0.0
                    for j in qList:
                        if totalSum[i][j] > 0.0:
                            termScore[totalDim[i]] += math.log(totalSum[i][j])
                # Find the precision for different term-set sizes.
                sortTerms = sorted(termScore.iteritems(), reverse=True, key=lambda x: x[1])
                for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
                    try:
                        cTerms = set([x[0] for x in sortTerms[:i]])
                        print 'CTERMS ', sortTerms[0:10], len(cTerms), 'ATERMS', aTerms
                        p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
                        r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
                        prec[i] = prec.setdefault(i, 0.0) + p
                        recall[i] = recall.setdefault(i, 0.0) + r
                        print 'Prec', i, '\t', query, '\t', p
                    except Exception as err:
                        # Slicing with the string '10000' raises TypeError, so this
                        # branch scores the full term list instead of a cut-off.
                        cTerms = set([x[0] for x in sortTerms])
                        p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
                        r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
                        prec[i] = prec.setdefault(i, 0.0) + p
                        recall[i] = recall.setdefault(i, 0.0) + r
                        print 'Prec', i, '\t', query, '\t', p
            else:
                for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
                    print 'Prec', i, '\t', query, '\t', 0.0
    # Average the prec & recall.
    # Print prec and recall.
    print 'Printing Precision'
    for entry, value in prec.iteritems():
        print entry, value / count
    print 'Printing Recall'
    for entry, value in recall.iteritems():
        print entry, value / count
    wIndex.close()
    qIndex.close()
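# A self-contained toy sketch of the discounted walk used in findMarkovStats
# above: stage matrices are chained according to a schedule, step k is weighted
# by alpha**k, and the sum is scaled by (1 - alpha). The helper name
# discounted_walk and the 3x3 row-stochastic matrices are made up for
# illustration; unlike the original, this sketch left-multiplies every stage,
# including the background-smoothing matrix.
import math
import numpy

def discounted_walk(schedule, matrices, dim, alpha=0.80):
    # Chain the stage matrices over the schedule, discounting step k by alpha**k.
    cK = numpy.ones((dim, dim))
    totalSum = numpy.zeros((dim, dim))
    for k, name in enumerate(schedule):
        cK = numpy.dot(matrices[name], cK)
        totalSum += math.pow(alpha, k) * cK
    return totalSum * (1 - alpha)

# Toy 3-term example using the same schedule as findMarkovStats.
dim = 3
stages = {
    'stage1': numpy.full((dim, dim), 1.0 / dim),   # stand-in for the stem network
    'stage2': numpy.array([[0.6, 0.2, 0.2],
                           [0.1, 0.8, 0.1],
                           [0.3, 0.3, 0.4]]),      # stand-in for the query/wiki mixture
    'stage3': numpy.full((dim, dim), 1.0 / dim),   # background smoothing
}
walk = discounted_walk(['stage1', 'stage2', 'stage2', 'stage2', 'stage3'], stages, dim)
print(walk[0])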