示例#1
0
def getDocuments(ism, isle, storageId, locationId):
    """Return the documents found for one storage ID on one shard as a list.

    A fresh IndexSearchConstraint is restricted to the island ``isle``, to
    the single storage ID ``storageId`` (wrapped in ``Long``) and to the
    shard ``locationId``, with output fields set to ``['*']``.  An empty
    query string is then run through ``ism.search`` as
    ``CallerApp.INTERNAL`` and everything the result yields on iteration
    is collected.
    """
    constraint = IndexSearchConstraint(None, None)
    constraint.constrainByIsland(isle)
    constraint.constrainByStorageIds([Long(storageId)])
    constraint.setShardConstraint(locationId)
    constraint.setOutputFields(['*'])
    hits = ism.search('', constraint, None, CallerApp.INTERNAL)
    # Copy the iterable result into a plain Python list for the caller.
    return [hit for hit in hits]
示例#2
0
def migrate(custIds,islandId,sourceIslandId,minRange,maxRange,showAndExitOnly=False):
    """Search the source island and put one document per storage ID on QUEUE.

    Parameters:
        custIds, maxRange -- handed to buildSolrQuery / buildFastQuery to
            build the query text.
        islandId          -- not used by this function.
        sourceIslandId    -- converted with int() and used to look up the
            island that is searched.
        minRange          -- initial minimum storage ID constraint.
        showAndExitOnly   -- when true, return right after the first search
            has been timed and logged; nothing is queued.

    Side effects: sets the module globals StartCount (document count of the
    first search) and TotSearchTime (accumulated search seconds), writes
    log lines through log(), and puts documents on QUEUE.

    Returns None.
    """
    global QUEUE,MsgsProcessed,StartCount,TotSearchTime
    mc = ManagementContainer.getInstance()
    ism = mc.getIndexSearchManager()
    im = mc.getIslandManager()
    sourceIsland = im.getIsland(int(sourceIslandId))
    sourceType = sourceIsland.getIndexPlatformVersion()

    # search for documents
    isc = IndexSearchConstraint(None,None)
    isc.sortBy('storageid',True)
    isc.constrainByMinimumStorageId(minRange)
    # Reuse the island resolved above.  The original looked it up a second
    # time with the raw (un-converted) sourceIslandId, which disagrees with
    # the int() lookup whenever the caller passes the ID as a string.
    isc.constrainByIsland(sourceIsland)

    if IndexPlatformVersion.SOLR_x == sourceType:
        query = buildSolrQuery(custIds, maxRange)
    else:
        query = buildFastQuery(custIds, maxRange)
    start = time.time()
    sr = search(query,isc,ism)
    StartCount = sr.getDocCount()
    elapsed = time.time() - start
    TotSearchTime = TotSearchTime + elapsed
    log('PERF: search',StartCount,'msgs, time',elapsed,'s','total',TotSearchTime,'s')
    if showAndExitOnly:
        return
    hitsRemaining = ism.getMaxIndextHits()
    lastStorageId = 0
    # Named docIter (not iter) so the builtin iter() is not shadowed.
    docIter = sr.documents().iterator()

    while checkSearchIter(docIter):
        doc = docIter.next()
        hitsRemaining -= 1

        storageId = doc.getStorageID()

        # only process each EMS message ID one time
        if storageId != lastStorageId:
            QUEUE.put(doc)
            lastStorageId = storageId

        # Page forward once the hit budget is spent or the current result
        # set has nothing more to offer.
        if hitsRemaining <= 0 or not checkSearchIter(docIter):
            log('STAT: getting new search results beyond storageId',storageId)
            isc.constrainByMinimumStorageId(storageId)
            start = time.time()
            # NOTE(review): follow-up pages add ' and isattachment:0' while the
            # first search uses the bare query -- confirm this is intended.
            # NOTE(review): termination relies on this search eventually
            # returning nothing that checkSearchIter accepts; whether the
            # minimum storage ID bound is inclusive is not visible here.
            sr = search(query + ' and isattachment:0',isc,ism)
            elapsed = time.time() - start
            TotSearchTime = TotSearchTime + elapsed
            log('PERF: search time',elapsed,'s','total',TotSearchTime,'s')
            hitsRemaining = ism.getMaxIndextHits()
            docIter = sr.documents().iterator()

    log('STAT: search work complete')
示例#3
0
def test(islandId, numMessages, esHosts):
    mc = ManagementContainer.getInstance()
    lastVal = None
    custId = None
    service = Service("storageimporter")
    msgs = None

    try:
        lastVal, nextVal = setMessageId(mc, Integer.MAX_VALUE - 2)
        print "lastVal,nextVal", lastVal, nextVal
        print "restart storageimporters", esHosts
        service.invoke("restart", esHosts)
        custId = setupCustomer(mc, islandId)
        msgs = findMessages(mc, custId, numMessages)
        print "created messages", msgs
        checkSearchStatus(mc, msgs, custId)

        ism = mc.getIndexSearchManager()
        isc = IndexSearchConstraint(None, None)
        isc.sortBy("storageid", True)
        isc.constrainByMinimumStorageId(0)
        isc.constrainByIsland(mc.getIslandManager().getIsland(islandId))
        print "DBG min", 0
        query = "customerid:" + str(custId)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 10:
            print >> sys.stderr, "Failed to find 10 documents when minStorageId = 0, found", sr.getDocCount()
            return 1
        print "DBG min", Integer.MAX_VALUE - 1
        isc.constrainByMinimumStorageId(Integer.MAX_VALUE - 1)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 9:
            print >> sys.stderr, "Failed to find 9 documents when minStorageId = ", Integer.MAX_VALUE - 1, ",found", sr.getDocCount()
            return 1
        print "DBG min", Integer.MAX_VALUE + 1
        isc.constrainByMinimumStorageId(Integer.MAX_VALUE + 1)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 7:
            print >> sys.stderr, "Failed to find 7 documents when minStorageId = ", Integer.MAX_VALUE + 1, ",found", sr.getDocCount()
            return 1
    finally:
        if msgs is not None:
            for msg in msgs:
                purgeMessage(mc, msg, custId)
        if custId is not None:
            mc.getCustomerManager().deleteCustomers([custId])
        if lastVal is not None:
            lastVal, nextVal = setMessageId(mc, lastVal)
            print "lastVal,nextVal", lastVal, nextVal
            print "restart storageimporters", esHosts
            service.invoke("restart", esHosts)
    return 0
示例#4
0
def performSampleAudit(loc,samplePercent,seed):
    global BATCH_SAMPLE_PCT
    mc = ManagementContainer.getInstance()
    cm = mc.getClusterManager()
    im = mc.getIslandManager()
    cluster = cm.getCluster(loc.getClusterId())
    island = im.getIsland(cluster.getIslandId())

    masterURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP))
    slaveURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP))

    ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmMaster.setURL(masterURL)
    ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmSlave.setURL(slaveURL)

    sm = mc.getIndexSearchManager()
    # note that this will be slow -- especially if samplePercent is large and data set size is large

    srcIS = IndexSearchConstraint(None,None)
    srcIS.constrainByNumberOfHitsToReturn(1)
    srcIS.constrainByIsland(island)

    destIS = IndexSearchConstraint(None,None)
    destIS.constrainByNumberOfHitsToReturn(1)
    destIS.constrainByIsland(island)

    # get a message count for the customer
    srcQR = search(MainMessageQuery(),srcIS,ssmSlave)
    srcDocCount = srcQR.getDocCount()
    print 'total messages',srcDocCount,'msgs'
    msgsToSample = int(srcDocCount * samplePercent)
   
    if msgsToSample < 100:
        chunkSize = 10
    elif msgsToSample < 5000:
        chunkSize = 100
    else:
        chunkSize = 1000
    print 'Source sample size',msgsToSample,'msgs'

    # ensure 10% on selections per sample
    if samplePercent < 0.10:
        samplePercent = 0.10
    # if corpus is large, sample more per chunk
    if chunkSize >= 1000 and msgsToSample > 50000:
        samplePercent = 0.5

    # perform sample audit
    sampleCountsFromSource(ssmSlave,ssmMaster,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
示例#5
0
def performQuickAudit(location):
    mc = ManagementContainer.getInstance()
    clus = mc.getClusterManager().getCluster(location.getClusterId())
    isle = mc.getIslandManager().getIsland(clus.getIslandId())

    masterURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP))
    slaveURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP))

    ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmMaster.setURL(masterURL)
    ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmSlave.setURL(slaveURL)

    srcIS = IndexSearchConstraint(None,None)
    srcIS.constrainByIsland(isle)
    srcIS.constrainByNumberOfHitsToReturn(1)

    destIS = IndexSearchConstraint(None,None)
    destIS.constrainByIsland(isle)
    destIS.constrainByNumberOfHitsToReturn(1)

    srcQR = search('',srcIS,ssmSlave)
    destQR = search('',destIS,ssmMaster)

    srcDocCount = srcQR.getDocCount()
    destDocCount = destQR.getDocCount()

    srcQR = search(MainMessageQuery(),srcIS,ssmSlave)
    destQR = search(MainMessageQuery(),destIS,ssmMaster)

    srcMessages = srcQR.getDocCount()
    destMessages = destQR.getDocCount()

    srcQR = search(AttachmentQuery(),srcIS,ssmSlave)
    destQR = search(AttachmentQuery(),destIS,ssmMaster)

    srcAttachments = srcQR.getDocCount()
    destAttachments = destQR.getDocCount()

    srcQR = search('-isattachment:*',srcIS,ssmSlave)
    destQR = search('-isattachment:*',destIS,ssmMaster)

    srcNoAttachments = srcQR.getDocCount()
    destNoAttachments = destQR.getDocCount()

    print 'SOURCE',slaveURL,'DEST',masterURL
    print 'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount
    print 'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages
    print 'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments
    print 'SOURCE (test data):',srcNoAttachments,'DEST (test data):',destNoAttachments
示例#6
0
def analyze(islandId,timeBoundary,commit,type):

    mc = ManagementContainer.getInstance()
    im = mc.getIslandManager()
    isle = im.getIsland(islandId)
    fIS = IndexSearchConstraint(None,None)
    fIS.constrainByIsland(isle)
    fIS.constrainByType(type)
    ism = mc.getIndexSearchManager()
    qb = MyCriteria("isattachment:0 AND processingtime:[%s TO *]" % timeBoundary)
    qr = ism.scaledSearch(qb,fIS,CallerApp.INTERNAL)
    for doc in qr:
       pass
    dupsById = qr.getDuplicates()   

    print time.asctime(),'Found',dupsById.size(),type,'duplicates in island',islandId,'total docs',qr.getDocCount()

    for me in dupsById.entrySet():
       storageId = me.getKey()
       locIds = me.getValue()
       data = {}
       for locId in locIds:
           docs = getDocuments(ism,isle,storageId,locId)
           byId = {}
           for doc in docs:
               byId[doc.getString(IIndexSearchSchema.FIELD_CONTENT_ID)] = doc
           data[locId] = byId
       try:
           dupData = []
           if commit is True:
               (found,dupData) = simpleValidateData(data)
               if not found:
                   print time.asctime(),'The DB has no matching record for any of the documents found in the archive. Skipping',storageId
                   continue
           else:
               dupData = validateData(data)

           for storageId,locId in dupData:
               print time.asctime(),'Deleting documents for',storageId,'on location',locId
               deleteDuplicate(locId,storageId,commit,type)
          
       except Exception,e:
           print time.asctime(),'Not recommending any change for storage ID',storageId,'due to validation value',e
示例#7
0
def performSampleAudit(custIds,samplePercent,seed):
    global BATCH_SAMPLE_PCT
    mc = ManagementContainer.getInstance()
    cm = mc.getCustomerManager()
    sm = mc.getIndexSearchManager()
    # note that this will be slow -- especially if samplePercent is large and data set size is large
    for custId in custIds:
        cust = cm.getCustomer(custId)

        srcIS = IndexSearchConstraint(None,None)
        srcIS.constrainByIsland(cust.getOldFeedIsland())
        srcIS.constrainByNumberOfHitsToReturn(1)
        srcIS.constrainByCustomerId(custId)

        destIS = IndexSearchConstraint(None,None)
        destIS.constrainByIsland(cust.getFeedIsland())
        destIS.constrainByNumberOfHitsToReturn(1)
        destIS.constrainByCustomerId(custId)

        # get a message count for the customer
        srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm)
        srcDocCount = srcQR.getDocCount()
        print 'Customer',custId,'total messages',srcDocCount,'msgs'
        msgsToSample = int(srcDocCount * samplePercent)
   
        if msgsToSample < 100:
            chunkSize = 10
        elif msgsToSample < 5000:
            chunkSize = 100
        else:
            chunkSize = 1000
        print 'Customer',custId,'Source sample size',msgsToSample,'msgs'

        # ensure 10% on selections per sample
        if samplePercent < 0.10:
            samplePercent = 0.10
        # if corpus is large, sample more per chunk
        if chunkSize >= 1000 and msgsToSample > 50000:
            samplePercent = 0.5

        # perform sample audit
        sampleCountsFromSource(sm,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
示例#8
0
def search(isle, term):
    mc = ManagementContainer.getInstance()
    sm = mc.getIndexSearchManager()
    im = mc.getIslandManager()
    pm = mc.getPartitionManager()

    fIS = IndexSearchConstraint(None, None)
    fIS.constrainByIsland(isle)
    fIS.constrainByNumberOfHitsToReturn(10)
    fIS.constrainByMinimumStorageId(0)
    fIS.setOutputFields(["storageid", "partitionid"])
    fIS.sortBy("storageid", True)

    done = False
    cnt = 0
    lastID = 0
    while not done:
        ok = False
        while not ok:
            try:
                fQR = sm.search(term, fIS, None, CallerApp.INTERNAL)
                ok = True
            except Throwable, t:
                print "Exception caught during search, retry = true", t
                t.printStackTrace()

        numDocs = fQR.getDocCount()
        print "Found numDocs", numDocs

        done = fQR.getDocCount() == 0

        print "Preview some data"
        for doc in fQR.documents():
            # print doc.getPartitionID(),pm.getPartition(doc.getPartitionID()).isReadOnly(),doc.getStorageID(),doc.getReceivedDate()
            print doc.getPartitionID(), doc.getStorageID()
            lastID = doc.getStorageID()
            cnt += 1
        fIS.constrainByMinimumStorageId(lastID)

        # just loop once
        done = True
示例#9
0
def performQuickAudit(custIds):
    mc = ManagementContainer.getInstance()
    cm = mc.getCustomerManager()
    sm = mc.getIndexSearchManager()
    for custId in custIds:
        cust = cm.getCustomer(custId)

        srcIS = IndexSearchConstraint(None,None)
        srcIS.constrainByIsland(cust.getOldFeedIsland())
        srcIS.constrainByNumberOfHitsToReturn(1)
        srcIS.constrainByCustomerId(custId)

        destIS = IndexSearchConstraint(None,None)
        destIS.constrainByIsland(cust.getFeedIsland())
        destIS.constrainByNumberOfHitsToReturn(1)
        destIS.constrainByCustomerId(custId)

        srcQR = searchAndWrap('',srcIS,sm)
        destQR = search('',destIS,sm)

        srcDocCount = srcQR.getDocCount()
        destDocCount = destQR.getDocCount()

        srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm)
        destQR = search(MainMessageQuery(),destIS,sm)

        srcMessages = srcQR.getDocCount()
        destMessages = destQR.getDocCount()

        srcQR = searchAndWrap(AttachmentQuery(),srcIS,sm)
        destQR = search(AttachmentQuery(),destIS,sm)

        srcAttachments = srcQR.getDocCount()
        destAttachments = destQR.getDocCount()

        print 'Customer',custId,'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount
        print 'Customer',custId,'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages
        print 'Customer',custId,'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments
示例#10
0
def test(islandId,custName,numMessages):
    mc = ManagementContainer.getInstance()
    
    custid = findCustomer(custName)    
    if custid < 0:
        print 'test failed because customer',custName,'was not found'
        return 1
        
    island = mc.getIslandManager().getIsland(islandId)
    
    try:
        msgs = findMessages(mc,custid,numMessages)
    
        if msgs.size() < numMessages:
            print 'Fail, Did not find all messages stored, only found', msgs.size()
            return 1
    
        if not checkSearchStatus(mc,msgs,custid):
            print 'Fail, could not find all messages in search index'
            return 1

        ism = mc.getIndexSearchManager()
        isc = IndexSearchConstraint(custid,None)

        isc.constrainByNumberOfHitsToReturn(1)
        isc.constrainByIsland(island)
        isc.constrainByOffset(0)
        try:
            results = ism.resolveLocations(isc)
            print 'Fail, expected IndexSearchException'
            return 1
        except IndexSearchException, e:
            print 'Pass, IndexSearchException',e
        except:
            print 'Expected IndexSearchException, but got',sys.exc_info(),traceback.print_exc(file=sys.stderr)
            return 1

        isc.constrainByNumberOfHitsToReturn(2)
        isc.constrainByOffset(0)
        try:
            results = ism.resolveLocations(isc)
            sz = getResultSize(results)
            print 'Pass, hits=2',sz
          
            if sz < numMessages:
                print 'Wrong number of results. Expected >=',numMessages,'Got',sz
        except:
            print 'Unexpected exception caught when hits was set to 2',sys.exc_info(),traceback.print_exc(file=sys.stderr)
            return 1

        isc.constrainByNumberOfHitsToReturn(numMessages)
        isc.constrainByOffset(0)
        try:
            results = ism.resolveLocations(isc)
            sz = getResultSize(results)
            print 'Pass, hits=',numMessages,sz
            if sz < numMessages:
                print 'Wrong number of results. Expected >=',numMessages,'Got',sz,results.getResults()
        except:
            print 'Unexpected exception caught when hits was set to',numMessages,sys.exc_info(),traceback.print_exc(file=sys.stderr)
            return 1