示例#1
0
文件: bow.py 项目: xhujerr/Sumid
class BOWBuilderSimple(AbstractProducer):
    """
    Creates a bag of words (bow) from the urls delivered by a parser.

    Words are accumulated in self.counters and handed over to the output
    queue in batches (see produce and finalize).
    """

    def customInitialization(self):
        """ Takes the parser from kwargs["parserInstance"] and creates empty counters. """
        self.parser = self.kwargs["parserInstance"]
        self.initializeCounters()

    def initializeCounters(self):
        """ Initialization for BOWBuilderSimple is trivial (unlike BOWBuilderComplex)."""
        self.counters = CounterManager()
        self.counters.title = "word"  # 026 Title is used by FilesAdaptor to determine column.

    def consume(self):
        """
        Consume just asks parser for next url.

        Returns whatever self.parser.next() returned. Every call increments
        self.cx; every 10th call is logged when the url is truthy.
        """
        url = self.parser.next()
        # 026 Just non-mandatory logging:
        # NOTE(review): self.cx is not initialized in this class - presumably AbstractProducer sets it.
        self.cx += 1
        if url and self.cx % 10 == 0:
            self.logger.debug("Processing url number: %i. Last processed url: %s" % (self.cx, url.Composed))
        return url

    def verifySemiProduct(self, semiProduct):
        """ SemiProduct must be an SmartURL instance. Returns True if it is, False otherwise. """
        return isinstance(semiProduct, SmartURL)

    def _putCountersToQueue(self):
        """ Puts current counters into the output queue while holding outputCondition. """
        self.outputCondition.acquire()
        try:
            self.outputQueue.put(self.counters)
        finally:
            # 026 Release even if put() raises, otherwise the condition stays locked.
            self.outputCondition.release()

    def produce(self, semiProduct):
        """
        Doesn't create product every iteration.
        Just accumulates words until the number of counters exceeds settings.bowTempSize
        (or the parser reports linklistEOF).
        Then puts old counters into output queue and starts counting from 0.

        Words listed in settings.commonWords are not counted.
        The meaning of the return value is, if a counter was added to the queue:
        True when counters were put into the output queue, False otherwise.
        """
        words = semiProduct.Words(self.settings.urlWordsSeparators)

        for word in words:
            if word not in self.settings.commonWords:
                self.counters.increment(word)

        if self.counters.NumberOfCounters > self.settings.bowTempSize or self.parser.linklistEOF:
            # 026 Put full counters to output queue.
            self._putCountersToQueue()
            # 026 Create new empty counters:
            self.counters = CounterManager()
            self.counters.title = "word"
            return True
        return False

    def finalize(self):
        """ Puts whatever is left in the counters into the output queue. """
        self._putCountersToQueue()
示例#2
0
 def test_updateBOW(self):
     """
     Sends the same counters to updateBOW twice and checks the BOW table after each call.
     After the first call the table holds the original counts, after the second call
     the counts are doubled while the number of rows stays the same.
     """
     self.filesAdaptor.connectDB()
     expected = [["word1", 1], ["word2", 2], ["word3", 3]]
     selectAll = "select * from BOW order by word asc;"

     bow = CounterManager()
     bow.title = "word"
     for word, amount in expected:
         bow.increment(word, amount)

     def checkTable(multiplier):
         # Reads the whole BOW table and compares it row by row with expected counts.
         self.filesAdaptor.DBcursor.execute(selectAll)
         rowsSeen = 0
         for row in self.filesAdaptor.DBcursor:
             word, amount = expected[rowsSeen]
             self.assertEqual(row[1], word)
             self.assertEqual(row[2], amount * multiplier)
             rowsSeen += 1
         self.assertEqual(rowsSeen, len(expected))

     # First update: every counter is new for the table.
     firstResult = self.filesAdaptor.updateBOW(bow)
     checkTable(1)
     self.assertEqual(firstResult, len(expected))

     # Second update: same words again, so counts double and row count is unchanged.
     self.filesAdaptor.updateBOW(bow)
     checkTable(2)
     # NOTE(review): this re-checks the return value of the FIRST update; the second
     # call's return value is not captured. Confirm whether that is intended.
     self.assertEqual(firstResult, len(expected))
示例#3
0
文件: bow.py 项目: xhujerr/Sumid
    def produce(self, semiProduct):
        """
        Doesn't create product every iteration.
        Just accumulates words until the number of counters exceeds settings.bowTempSize
        (or the parser reports linklistEOF).
        Then puts old counters into output queue and starts counting from 0.

        semiProduct must provide Words(separators); words listed in
        settings.commonWords are not counted.
        Returns True when counters were put into the output queue, False otherwise.
        """
        words = semiProduct.Words(self.settings.urlWordsSeparators)

        for word in words:
            if word not in self.settings.commonWords:
                self.counters.increment(word)

        if self.counters.NumberOfCounters > self.settings.bowTempSize or self.parser.linklistEOF:
            # 026 Put full counters to output queue.
            self.outputCondition.acquire()
            try:
                self.outputQueue.put(self.counters)
            finally:
                # 026 Release even if put() raises, otherwise the condition stays locked.
                self.outputCondition.release()
            # 026 Create new empty counters:
            self.counters = CounterManager()
            self.counters.title = "word"
            return True
        return False
示例#4
0
文件: bow.py 项目: xhujerr/Sumid
 def initializeCounters(self):
     """ Creates fresh, empty counters titled "word" and stores them in self.counters. """
     emptyCounters = CounterManager()
     # 026 FilesAdaptor uses the title to determine the column.
     emptyCounters.title = "word"
     self.counters = emptyCounters