def getDocuments(ism, isle, storageId, locationId):
    """Return the index documents for one storage ID on one shard.

    ism        -- search manager; its search(query, constraint, None,
                  CallerApp.INTERNAL) method is invoked once
    isle       -- island the search is constrained to
    storageId  -- storage ID to look up (wrapped in a java Long)
    locationId -- value handed to setShardConstraint

    Returns a Python list with every document yielded by iterating the
    search result.
    """
    constraint = IndexSearchConstraint(None, None)
    constraint.constrainByIsland(isle)
    constraint.constrainByStorageIds([Long(storageId)])
    constraint.setShardConstraint(locationId)
    # NOTE(review): '*' presumably requests every stored field -- confirm
    constraint.setOutputFields(['*'])

    # empty query string: only the constraint narrows the result
    hits = ism.search('', constraint, None, CallerApp.INTERNAL)
    return [doc for doc in hits]
def migrate(custIds,islandId,sourceIslandId,minRange,maxRange,showAndExitOnly=False):
    """Search the source island and push each distinct document onto QUEUE.

    custIds         -- customer IDs handed to the query builder
    islandId        -- not used by this function
    sourceIslandId  -- ID of the island to search
    minRange        -- initial minimum storage ID constraint
    maxRange        -- handed to the query builder
    showAndExitOnly -- when true, run and log the first search, then return

    Updates the globals StartCount (doc count of the first search) and
    TotSearchTime (accumulated search seconds). Returns None.
    """
    global QUEUE,MsgsProcessed,StartCount,TotSearchTime

    container = ManagementContainer.getInstance()
    ism = container.getIndexSearchManager()
    islandMgr = container.getIslandManager()
    sourceType = islandMgr.getIsland(int(sourceIslandId)).getIndexPlatformVersion()

    # search for documents
    isc = IndexSearchConstraint(None,None)
    isc.sortBy('storageid',True)
    isc.constrainByMinimumStorageId(minRange)
    isc.constrainByIsland(container.getIslandManager().getIsland(sourceIslandId))

    # the query syntax depends on the index platform of the source island
    if IndexPlatformVersion.SOLR_x == sourceType:
        buildQuery = buildSolrQuery
    else:
        buildQuery = buildFastQuery
    query = buildQuery(custIds, maxRange)

    began = time.time()
    sr = search(query,isc,ism)
    StartCount = sr.getDocCount()
    elapsed = time.time() - began
    TotSearchTime += elapsed
    log('PERF: search',StartCount,'msgs, time',elapsed,'s','total',TotSearchTime,'s')

    if showAndExitOnly:
        return

    hitsRemaining = ism.getMaxIndextHits()
    lastStorageId = 0
    docIter = sr.documents().iterator()
    while checkSearchIter(docIter):
        doc = docIter.next()
        hitsRemaining -= 1
        storageId = doc.getStorageID()

        # only process each EMS message ID one time
        if storageId != lastStorageId:
            QUEUE.put(doc)
            lastStorageId = storageId

        # keep consuming the current page while it still has budget and docs
        if hitsRemaining > 0 and checkSearchIter(docIter):
            continue

        # page exhausted: search again starting from the last storage ID seen
        log('STAT: getting new search results beyond storageId',storageId)
        isc.constrainByMinimumStorageId(storageId)
        began = time.time()
        sr = search(query + ' and isattachment:0',isc,ism)
        elapsed = time.time() - began
        TotSearchTime += elapsed
        log('PERF: search time',elapsed,'s','total',TotSearchTime,'s')
        hitsRemaining = ism.getMaxIndextHits()
        docIter = sr.documents().iterator()

    log('STAT: search work complete')
def test(islandId, numMessages, esHosts): mc = ManagementContainer.getInstance() lastVal = None custId = None service = Service("storageimporter") msgs = None try: lastVal, nextVal = setMessageId(mc, Integer.MAX_VALUE - 2) print "lastVal,nextVal", lastVal, nextVal print "restart storageimporters", esHosts service.invoke("restart", esHosts) custId = setupCustomer(mc, islandId) msgs = findMessages(mc, custId, numMessages) print "created messages", msgs checkSearchStatus(mc, msgs, custId) ism = mc.getIndexSearchManager() isc = IndexSearchConstraint(None, None) isc.sortBy("storageid", True) isc.constrainByMinimumStorageId(0) isc.constrainByIsland(mc.getIslandManager().getIsland(islandId)) print "DBG min", 0 query = "customerid:" + str(custId) sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL) if sr.getDocCount() != 10: print >> sys.stderr, "Failed to find 10 documents when minStorageId = 0, found", sr.getDocCount() return 1 print "DBG min", Integer.MAX_VALUE - 1 isc.constrainByMinimumStorageId(Integer.MAX_VALUE - 1) sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL) if sr.getDocCount() != 9: print >> sys.stderr, "Failed to find 9 documents when minStorageId = ", Integer.MAX_VALUE - 1, ",found", sr.getDocCount() return 1 print "DBG min", Integer.MAX_VALUE + 1 isc.constrainByMinimumStorageId(Integer.MAX_VALUE + 1) sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL) if sr.getDocCount() != 7: print >> sys.stderr, "Failed to find 7 documents when minStorageId = ", Integer.MAX_VALUE + 1, ",found", sr.getDocCount() return 1 finally: if msgs is not None: for msg in msgs: purgeMessage(mc, msg, custId) if custId is not None: mc.getCustomerManager().deleteCustomers([custId]) if lastVal is not None: lastVal, nextVal = setMessageId(mc, lastVal) print "lastVal,nextVal", lastVal, nextVal print "restart storageimporters", esHosts service.invoke("restart", esHosts) return 0
def performSampleAudit(loc,samplePercent,seed): global BATCH_SAMPLE_PCT mc = ManagementContainer.getInstance() cm = mc.getClusterManager() im = mc.getIslandManager() cluster = cm.getCluster(loc.getClusterId()) island = im.getIsland(cluster.getIslandId()) masterURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP)) slaveURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP)) ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmMaster.setURL(masterURL) ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmSlave.setURL(slaveURL) sm = mc.getIndexSearchManager() # note that this will be slow -- especially if samplePercent is large and data set size is large srcIS = IndexSearchConstraint(None,None) srcIS.constrainByNumberOfHitsToReturn(1) srcIS.constrainByIsland(island) destIS = IndexSearchConstraint(None,None) destIS.constrainByNumberOfHitsToReturn(1) destIS.constrainByIsland(island) # get a message count for the customer srcQR = search(MainMessageQuery(),srcIS,ssmSlave) srcDocCount = srcQR.getDocCount() print 'total messages',srcDocCount,'msgs' msgsToSample = int(srcDocCount * samplePercent) if msgsToSample < 100: chunkSize = 10 elif msgsToSample < 5000: chunkSize = 100 else: chunkSize = 1000 print 'Source sample size',msgsToSample,'msgs' # ensure 10% on selections per sample if samplePercent < 0.10: samplePercent = 0.10 # if corpus is large, sample more per chunk if chunkSize >= 1000 and msgsToSample > 50000: samplePercent = 0.5 # perform sample audit sampleCountsFromSource(ssmSlave,ssmMaster,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
def performQuickAudit(location): mc = ManagementContainer.getInstance() clus = mc.getClusterManager().getCluster(location.getClusterId()) isle = mc.getIslandManager().getIsland(clus.getIslandId()) masterURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP)) slaveURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP)) ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmMaster.setURL(masterURL) ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmSlave.setURL(slaveURL) srcIS = IndexSearchConstraint(None,None) srcIS.constrainByIsland(isle) srcIS.constrainByNumberOfHitsToReturn(1) destIS = IndexSearchConstraint(None,None) destIS.constrainByIsland(isle) destIS.constrainByNumberOfHitsToReturn(1) srcQR = search('',srcIS,ssmSlave) destQR = search('',destIS,ssmMaster) srcDocCount = srcQR.getDocCount() destDocCount = destQR.getDocCount() srcQR = search(MainMessageQuery(),srcIS,ssmSlave) destQR = search(MainMessageQuery(),destIS,ssmMaster) srcMessages = srcQR.getDocCount() destMessages = destQR.getDocCount() srcQR = search(AttachmentQuery(),srcIS,ssmSlave) destQR = search(AttachmentQuery(),destIS,ssmMaster) srcAttachments = srcQR.getDocCount() destAttachments = destQR.getDocCount() srcQR = search('-isattachment:*',srcIS,ssmSlave) destQR = search('-isattachment:*',destIS,ssmMaster) srcNoAttachments = srcQR.getDocCount() destNoAttachments = destQR.getDocCount() print 'SOURCE',slaveURL,'DEST',masterURL print 'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount print 'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages print 'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments print 'SOURCE (test data):',srcNoAttachments,'DEST (test data):',destNoAttachments
def analyze(islandId,timeBoundary,commit,type): mc = ManagementContainer.getInstance() im = mc.getIslandManager() isle = im.getIsland(islandId) fIS = IndexSearchConstraint(None,None) fIS.constrainByIsland(isle) fIS.constrainByType(type) ism = mc.getIndexSearchManager() qb = MyCriteria("isattachment:0 AND processingtime:[%s TO *]" % timeBoundary) qr = ism.scaledSearch(qb,fIS,CallerApp.INTERNAL) for doc in qr: pass dupsById = qr.getDuplicates() print time.asctime(),'Found',dupsById.size(),type,'duplicates in island',islandId,'total docs',qr.getDocCount() for me in dupsById.entrySet(): storageId = me.getKey() locIds = me.getValue() data = {} for locId in locIds: docs = getDocuments(ism,isle,storageId,locId) byId = {} for doc in docs: byId[doc.getString(IIndexSearchSchema.FIELD_CONTENT_ID)] = doc data[locId] = byId try: dupData = [] if commit is True: (found,dupData) = simpleValidateData(data) if not found: print time.asctime(),'The DB has no matching record for any of the documents found in the archive. Skipping',storageId continue else: dupData = validateData(data) for storageId,locId in dupData: print time.asctime(),'Deleting documents for',storageId,'on location',locId deleteDuplicate(locId,storageId,commit,type) except Exception,e: print time.asctime(),'Not recommending any change for storage ID',storageId,'due to validation value',e
def performSampleAudit(custIds,samplePercent,seed): global BATCH_SAMPLE_PCT mc = ManagementContainer.getInstance() cm = mc.getCustomerManager() sm = mc.getIndexSearchManager() # note that this will be slow -- especially if samplePercent is large and data set size is large for custId in custIds: cust = cm.getCustomer(custId) srcIS = IndexSearchConstraint(None,None) srcIS.constrainByIsland(cust.getOldFeedIsland()) srcIS.constrainByNumberOfHitsToReturn(1) srcIS.constrainByCustomerId(custId) destIS = IndexSearchConstraint(None,None) destIS.constrainByIsland(cust.getFeedIsland()) destIS.constrainByNumberOfHitsToReturn(1) destIS.constrainByCustomerId(custId) # get a message count for the customer srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm) srcDocCount = srcQR.getDocCount() print 'Customer',custId,'total messages',srcDocCount,'msgs' msgsToSample = int(srcDocCount * samplePercent) if msgsToSample < 100: chunkSize = 10 elif msgsToSample < 5000: chunkSize = 100 else: chunkSize = 1000 print 'Customer',custId,'Source sample size',msgsToSample,'msgs' # ensure 10% on selections per sample if samplePercent < 0.10: samplePercent = 0.10 # if corpus is large, sample more per chunk if chunkSize >= 1000 and msgsToSample > 50000: samplePercent = 0.5 # perform sample audit sampleCountsFromSource(sm,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
def search(isle, term): mc = ManagementContainer.getInstance() sm = mc.getIndexSearchManager() im = mc.getIslandManager() pm = mc.getPartitionManager() fIS = IndexSearchConstraint(None, None) fIS.constrainByIsland(isle) fIS.constrainByNumberOfHitsToReturn(10) fIS.constrainByMinimumStorageId(0) fIS.setOutputFields(["storageid", "partitionid"]) fIS.sortBy("storageid", True) done = False cnt = 0 lastID = 0 while not done: ok = False while not ok: try: fQR = sm.search(term, fIS, None, CallerApp.INTERNAL) ok = True except Throwable, t: print "Exception caught during search, retry = true", t t.printStackTrace() numDocs = fQR.getDocCount() print "Found numDocs", numDocs done = fQR.getDocCount() == 0 print "Preview some data" for doc in fQR.documents(): # print doc.getPartitionID(),pm.getPartition(doc.getPartitionID()).isReadOnly(),doc.getStorageID(),doc.getReceivedDate() print doc.getPartitionID(), doc.getStorageID() lastID = doc.getStorageID() cnt += 1 fIS.constrainByMinimumStorageId(lastID) # just loop once done = True
def performQuickAudit(custIds): mc = ManagementContainer.getInstance() cm = mc.getCustomerManager() sm = mc.getIndexSearchManager() for custId in custIds: cust = cm.getCustomer(custId) srcIS = IndexSearchConstraint(None,None) srcIS.constrainByIsland(cust.getOldFeedIsland()) srcIS.constrainByNumberOfHitsToReturn(1) srcIS.constrainByCustomerId(custId) destIS = IndexSearchConstraint(None,None) destIS.constrainByIsland(cust.getFeedIsland()) destIS.constrainByNumberOfHitsToReturn(1) destIS.constrainByCustomerId(custId) srcQR = searchAndWrap('',srcIS,sm) destQR = search('',destIS,sm) srcDocCount = srcQR.getDocCount() destDocCount = destQR.getDocCount() srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm) destQR = search(MainMessageQuery(),destIS,sm) srcMessages = srcQR.getDocCount() destMessages = destQR.getDocCount() srcQR = searchAndWrap(AttachmentQuery(),srcIS,sm) destQR = search(AttachmentQuery(),destIS,sm) srcAttachments = srcQR.getDocCount() destAttachments = destQR.getDocCount() print 'Customer',custId,'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount print 'Customer',custId,'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages print 'Customer',custId,'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments
def test(islandId,custName,numMessages): mc = ManagementContainer.getInstance() custid = findCustomer(custName) if custid < 0: print 'test failed because customer',custName,'was not found' return 1 island = mc.getIslandManager().getIsland(islandId) try: msgs = findMessages(mc,custid,numMessages) if msgs.size() < numMessages: print 'Fail, Did not find all messages stored, only found', msgs.size() return 1 if not checkSearchStatus(mc,msgs,custid): print 'Fail, could not find all messages in search index' return 1 ism = mc.getIndexSearchManager() isc = IndexSearchConstraint(custid,None) isc.constrainByNumberOfHitsToReturn(1) isc.constrainByIsland(island) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) print 'Fail, expected IndexSearchException' return 1 except IndexSearchException, e: print 'Pass, IndexSearchException',e except: print 'Expected IndexSearchException, but got',sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1 isc.constrainByNumberOfHitsToReturn(2) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) sz = getResultSize(results) print 'Pass, hits=2',sz if sz < numMessages: print 'Wrong number of results. Expected >=',numMessages,'Got',sz except: print 'Unexpected exception caught when hits was set to 2',sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1 isc.constrainByNumberOfHitsToReturn(numMessages) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) sz = getResultSize(results) print 'Pass, hits=',numMessages,sz if sz < numMessages: print 'Wrong number of results. Expected >=',numMessages,'Got',sz,results.getResults() except: print 'Unexpected exception caught when hits was set to',numMessages,sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1