Пример #1
0
 def StartCrawler(self):
     """Start an Engine thread for each queued url that has none yet, then start the elapsed timer.

     Urls whose ID is already listed in self.CurrentCrawlingUrls are skipped.
     The timer is only (re)created when self.ElapsedTimerStarted is false: it is
     seeded from the previous timer's Hour/Minute/Second when one exists,
     otherwise from the Elaped*_Project values.
     """
     self.SetButtonsState(Started = True)
     for queued in self.Urls:
         # an id that is already registered is not crawled a second time
         if queued.ID not in self.CurrentCrawlingUrls:
             self.CurrentCrawlingUrls.append(queued.ID)
             worker = Engine(Url = queued ,
                             PatternsList = self.Patterns ,
                             RepLiter = self.Project.RepLiter,
                             Parent = self,
                             DBEngine = self.Project.DBEngine ,
                             Connection = self.DatabaseConnection ,
                             Cursor = self.Cursor ,
                             AllToGether = self.AllPatternsChk.isChecked()
                             )
             self.EngineThreads.append(worker)
             worker.start()

     self.IsCrawling = True
     if self.ElapsedTimerStarted:
         return  # timer is already running, nothing more to do

     self.ElapsedTimerStarted = True
     if hasattr(self, "ElapsedTimer"):
         # continue counting from where the previous timer object stood
         previous = self.ElapsedTimer
         self.ElapsedTimer = Timer(self ,
                                   Hour = previous.Hour ,
                                   Minute = previous.Minute ,
                                   Second = previous.Second)
     else:
         # first start: use the values kept in the Elaped*_Project attributes
         self.ElapsedTimer = Timer(self ,
                                   self.ElapedHour_Project ,
                                   self.ElapedMinute_Project ,
                                   self.ElapedSeconds_Project)
     self.ElapsedTimer.start()
Пример #2
0
class PoriaSimpleCrawlerMain(QtGui.QMainWindow , Ui_PoriaSimpleCrawler):
    def ShowNewProjectWizard(self):
        """Create a PoriaCrawlerNewProjectWizard parented to this window, keep it on self and show it."""
        Wizard = PoriaCrawlerNewProjectWizard(self)
        self.NewProjectWizard = Wizard  # stored on the instance under NewProjectWizard
        Wizard.show()
        
    
    def ShowOpenFormDialog(self):
        """Show the "Open Project" file dialog and load the chosen project file.

        Does nothing when no path comes back from the dialog. When
        ProjectClass.LoadFromFile raises InvalidProjectFile an error box is
        shown and the project is not loaded; otherwise LoadProject() runs.
        """
        Selection = QtGui.QFileDialog.getOpenFileNameAndFilter(parent=self, caption=u"Open Project" , filter = "Poria's Simple Crawler | (*.psc)")
        ChosenPath = Selection[0]  # first element is the file name
        if not ChosenPath:
            return

        try:
            self.Project = ProjectClass.LoadFromFile(ChosenPath)
        except InvalidProjectFile:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"Project file is not valid!")
        else:
            # establish the db connection and fill the ui from the project
            self.LoadProject()
    
    
    
    def LoadPatterns(self):
        """Reload self.Patterns from the database and rebuild the patterns tree.

        Every pattern becomes one top-level tree row whose first column is its
        1-based position; each row is created with column 0 checked.
        """
        FetchedRows = FetchPatterns.FetchDBPatterns(self.Cursor)
        self.Patterns = FetchPatterns.ConvertFromDatabaseFetchList(FetchedRows)
        self.PatternsTree.clear()
        for Number, Pattern in enumerate(self.Patterns, 1):
            Row = QtGui.QTreeWidgetItem([unicode(Number)] + Pattern.ToTreeWidgetItemList())
            Row.setCheckState(0 , QtCore.Qt.Checked)
            self.PatternsTree.addTopLevelItem(Row)
    
    
    def LoadUrls(self):
        """Bring the loaded url list in line with the number of simultaneous pages.

        * Project.NumOfSimPages == 0: the default of 3 simultaneous urls is used.
        * Fewer urls loaded than requested: the missing ones are fetched from
          the database, appended to self.Urls and added to the urls tree.
        * More urls loaded than requested and the timer not started yet: the
          surplus is removed from the end of both the tree and self.Urls.
        * Otherwise nothing changes.
        """
        UrlsCount = len(self.Urls)
        if self.Project.NumOfSimPages == 0:#default number of sim urls is 3
            # NOTE(review): Limit becomes <= 0 once 3 or more urls are loaded;
            # confirm how QueueUrl.FetchDBUrls treats a non-positive limit.
            Limit = 3 - UrlsCount
        elif self.Project.NumOfSimPages > UrlsCount:
            Limit = self.Project.NumOfSimPages - UrlsCount #load new urls to match sim urls num
        else:
            if not self.ElapsedTimerStarted and self.Project.NumOfSimPages < UrlsCount: #Project is not started yet
                # Drop the surplus urls from the end. The counter is decremented
                # on every pass so the loop stops once the requested size is
                # reached (previously it never changed and the loop only ended
                # with an IndexError on the emptied list).
                while UrlsCount > self.Project.NumOfSimPages:
                    UrlsCount -= 1
                    self.UrlsTree.takeTopLevelItem(UrlsCount) #index of the current last row
                    del self.Urls[-1]
                self.ReArrangeUrlsNum()#correct number of urls in tree
            return
        #fetch new urls from db, ignoring any that are already loaded
        NewItems = [url for url in QueueUrl.ConvertFromDatabaseFetchList(QueueUrl.FetchDBUrls(self.Cursor , Limit , self.MaxUrlID)  , self) if url not in self.Urls]
        self.Urls.extend(NewItems)
        self.NumOfTotalUrls = QueueUrl.CountUrls(self.Cursor) #total number of urls
        CounterStart = UrlsCount + 1 #row number of the first new url in the tree
        for Item in NewItems:
            Item = QtGui.QTreeWidgetItem([unicode(CounterStart)]  + Item.ToTreeWidgetItemList())
            Item.setCheckState(0 , QtCore.Qt.Checked)
            self.UrlsTree.addTopLevelItem(Item)
            CounterStart += 1
        self.ReArrangeUrlsNum()
        
        
    #correcting urls number in tree
    def ReArrangeUrlsNum(self):
        """Rewrite column 0 of every top-level url row with its 1-based position."""
        RowCount = self.UrlsTree.topLevelItemCount()
        for Position in range(RowCount):
            Row = self.UrlsTree.topLevelItem(Position)
            Row.setText(0 , unicode(Position + 1))

    
    #loading project attributes and stablishing db connection and init ui objects with those
    def LoadProject(self):
        """Connect to the project's database and initialise the window from self.Project.

        Steps, in order: open the connection and cursor, choose the query
        placeholder, create the tables, reload patterns and urls, reset the
        input boxes, fill the statistic and elapsed-time labels, sync the
        check boxes / spin box with the project and enable the group boxes.

        A DatabaseConnectionStablishError is reported in a message box and
        aborts loading; DBNotExistAndNoCreate and AttributeError propagate to
        the caller.
        """
        try:
            Connection = StablishDatabaseConnection.GetConnection(self.Project , self)
            Connection.isolation_level = None
            self.DatabaseConnection = Connection
            self.Cursor = Connection.cursor()
            # placeholder used in queries depends on the selected db engine
            if self.Project.DBEngine == "sqlite":
                self.Project.RepLiter = "?"
            else:
                self.Project.RepLiter = '%s'
        except DatabaseConnectionStablishError:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"Database connection could not be stablished!")
            return
        except (DBNotExistAndNoCreate , AttributeError):
            # handed on unchanged to the caller
            raise

        self.Urls = []  # start from an empty url list
        CreateDatabaseTables.CreateTables(self.Cursor, self.Project, self)
        self.UrlsTree.clear()
        # NOTE(review): this runs before the widgets below are synced with the
        # project, so it stores the current widget state in the project - confirm intent.
        self.NumOfSimPagesChanged()
        self.LoadPatterns()
        self.LoadUrls()
        self.NewPatternTxt.clear()
        self.NewUrlTxt.clear()

        # statistic labels
        self.NumOfBrowsedPagesLbl.setText(self.Project.NumOfBrowsedPages)
        self.NumOfFetchedPatternsLbl.setText(self.Project.NumOfFetchedPatterns)
        self.NumOfQueueItemsLbl.setText(self.Project.NumOfQueueItems)

        # elapsed time: "0" means no time stored, otherwise three ':'-separated numbers
        if self.Project.ElapesdTime == "0":
            self.ElapesdTimeLbl.setText("00 : 00 : 00")
            self.ElapedHour_Project = self.ElapedMinute_Project = self.ElapedSeconds_Project = 0
        else:
            self.ElapesdTimeLbl.setText(self.Project.ElapesdTime)
            Parts = self.Project.ElapesdTime.split(":")
            Hours , Minutes , Seconds = int(Parts[0]) , int(Parts[1]) , int(Parts[2])
            self.ElapedHour_Project = Hours
            self.ElapedMinute_Project = Minutes
            self.ElapedSeconds_Project = Seconds

        # "0" unchecks the box, any other value checks it
        self.AllPatternsChk.setChecked(not (self.Project.AllPatternsTogether == "0"))

        if self.Project.NumOfSimPages == 0:
            self.NumOfSimPagesChk.setChecked(False)
            self.NumOfSimPagesSpn.setValue(1)
        else:
            self.NumOfSimPagesChk.setChecked(True)
            self.NumOfSimPagesSpn.setValue(int(self.Project.NumOfSimPages))

        self.Project.MaxBrowsingPages = int(self.Project.MaxBrowsingPages)
        self.groupBox.setEnabled(True)
        self.groupBox_2.setEnabled(True)
        self.groupBox_3.setEnabled(True)
        self.SetButtonsState(Stopped = True)
        
        
    def PatternsItemClicked(self):
        """Track the current patterns-tree item in self.SelectedPatterns according to its check state.

        A checked item is added (once), an unchecked one is removed. The
        remove-patterns button is enabled exactly when the selection is non-empty.
        """
        Current = self.PatternsTree.currentItem()
        if Current is not None:
            Listed = Current in self.SelectedPatterns
            if Current.checkState(0) == QtCore.Qt.Checked:
                if not Listed:
                    self.SelectedPatterns.append(Current)
            elif Listed:
                self.SelectedPatterns.remove(Current)

        self.RemoveSelectedPatternsBtn.setEnabled(bool(self.SelectedPatterns))
                
                
    def UrlItemClicked(self):
        """Track the current urls-tree item in self.SelectedUrls according to its check state.

        A checked item is added (once), an unchecked one is removed. The
        remove-urls button is enabled exactly when the selection is non-empty.
        """
        Current = self.UrlsTree.currentItem()
        if Current is not None:
            Listed = Current in self.SelectedUrls
            if Current.checkState(0) == QtCore.Qt.Checked:
                if not Listed:
                    self.SelectedUrls.append(Current)
            elif Listed:
                self.SelectedUrls.remove(Current)

        self.RemoveSelectedUrlsBtn.setEnabled(bool(self.SelectedUrls))
    
    
    #when remove patterns btn clicked the patterns should be deleted from tree and SelectedPatterns list
    def RemoveSelectedPatterns(self):
        """Delete every selected pattern from the tree, the database and self.Patterns.

        Refuses (with an error box) when the removal would leave no pattern.
        After a removal the selection list is emptied and the remove button
        disabled, so removed items cannot be processed a second time, then the
        patterns are reloaded.
        """
        if len(self.Patterns) - len(self.SelectedPatterns) < 1:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"Patterns list could not be empty")
            return

        # iterate over a copy: the selection list is emptied afterwards
        for Item in list(self.SelectedPatterns):
            Index = self.PatternsTree.indexOfTopLevelItem(Item)
            if Index < 0:
                # item is no longer in the tree; -1 would otherwise delete
                # the last entry of self.Patterns
                continue
            self.PatternsTree.takeTopLevelItem(Index)
            FetchPatterns.DeleteFromDB(self.Cursor, int(Item.text(3)) , self.Project.RepLiter) #Item.text(3) is id
            del self.Patterns[Index]

        # the removed items must not stay selected
        del self.SelectedPatterns[:]
        self.RemoveSelectedPatternsBtn.setEnabled(False)
        self.LoadPatterns()
    
    
    #when remove url btn clicked the urls should be deleted from tree and Urls list
    def RemoveSelectedUrls(self):
        """Delete every selected url from the tree, the database and self.Urls.

        Refuses (with an error box) when the removal would leave no url.
        After a removal the selection list is emptied and the remove button
        disabled, so removed items cannot be processed a second time, then
        LoadUrls() refills the list.
        """
        if len(self.Urls) - len(self.SelectedUrls) < 1:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"Urls queue could not be empty")
            return

        # iterate over a copy: the selection list is emptied afterwards
        for Item in list(self.SelectedUrls):
            Index = self.UrlsTree.indexOfTopLevelItem(Item)
            if Index < 0:
                # item is no longer in the tree; -1 would otherwise delete
                # the last entry of self.Urls
                continue
            self.UrlsTree.takeTopLevelItem(Index)
            # column 5 of the row is passed as the url ID
            UrlObj = QueueUrl(ID = unicode(Item.text(5)) , RepLiter = self.Project.RepLiter , Cursor = self.Cursor)
            UrlObj.DeleteFromDB()
            del self.Urls[Index]

        # the removed items must not stay selected
        del self.SelectedUrls[:]
        self.RemoveSelectedUrlsBtn.setEnabled(False)
        self.LoadUrls()
    
    
    #set  main buttons state relative to function parameter
    def SetButtonsState(self , Started = False , Paused = False , Stopped = False):
        """Enable/disable the start, pause and stop buttons for one crawler state.

        The first true flag in the order Started, Paused, Stopped decides;
        when no flag is true the buttons are left untouched.
        """
        # each tuple is (start enabled, pause enabled, stop enabled)
        if Started:
            Flags = (False , True , True)
        elif Paused:
            Flags = (True , False , True)
        elif Stopped:
            Flags = (True , False , False)
        else:
            return

        StartOn , PauseOn , StopOn = Flags
        self.StartCrawlingBtn.setEnabled(StartOn)
        self.PauseCrawlingBtn.setEnabled(PauseOn)
        self.StopCrawlingBtn.setEnabled(StopOn)
        
        
    #terminating crawling threads and changing buttons state when crawler is stopped
    def StopCrawler(self):
        """Stop crawling: reset the buttons, stop the timer and terminate all engine threads.

        The list of urls currently being crawled is emptied as well, because a
        terminated thread is not expected to report "Finished" for its url
        anymore; if its ID stayed listed, the next StartCrawler() would skip
        that url as "already being crawled".
        """
        self.SetButtonsState(Stopped = True)
        self.ElapsedTimer.emit(QtCore.SIGNAL("StopTimer()"))
        self.ElapsedTimerStarted = self.IsCrawling = False
        for Thread in self.EngineThreads:
            Thread.terminate()
        # self.EngineThreads is intentionally left as it is, so the thread
        # objects stay referenced while they shut down.
        del self.CurrentCrawlingUrls[:]
        
        
    #starting crawling threads and changing buttons state when crawler is started and running the timer
    def StartCrawler(self):
        """Start an Engine thread for each queued url that has none yet, then start the elapsed timer.

        Urls whose ID is already listed in self.CurrentCrawlingUrls are skipped.
        The timer is only (re)created when self.ElapsedTimerStarted is false: it
        is seeded from the previous timer's Hour/Minute/Second when one exists,
        otherwise from the Elaped*_Project values.
        """
        self.SetButtonsState(Started = True)
        for queued in self.Urls:
            # an id that is already registered is not crawled a second time
            if queued.ID not in self.CurrentCrawlingUrls:
                self.CurrentCrawlingUrls.append(queued.ID)
                worker = Engine(Url = queued ,
                                PatternsList = self.Patterns ,
                                RepLiter = self.Project.RepLiter,
                                Parent = self,
                                DBEngine = self.Project.DBEngine ,
                                Connection = self.DatabaseConnection ,
                                Cursor = self.Cursor ,
                                AllToGether = self.AllPatternsChk.isChecked()
                                )
                self.EngineThreads.append(worker)
                worker.start()

        self.IsCrawling = True
        if self.ElapsedTimerStarted:
            return  # timer is already running, nothing more to do

        self.ElapsedTimerStarted = True
        if hasattr(self, "ElapsedTimer"):
            # continue counting from where the previous timer object stood
            previous = self.ElapsedTimer
            self.ElapsedTimer = Timer(self ,
                                      Hour = previous.Hour ,
                                      Minute = previous.Minute ,
                                      Second = previous.Second)
        else:
            # first start: use the values kept in the Elaped*_Project attributes
            self.ElapsedTimer = Timer(self ,
                                      self.ElapedHour_Project ,
                                      self.ElapedMinute_Project ,
                                      self.ElapedSeconds_Project)
        self.ElapsedTimer.start()
            
    
    #loading new urls based on the number of simultaneous browsing pages changed
    def NumOfSimPagesChanged(self):
        """Copy the simultaneous-pages setting from the widgets into the project, reload urls and save.

        With the check box ticked the spin box value is used, otherwise 0.
        """
        UseSpinValue = self.NumOfSimPagesChk.isChecked()
        self.Project.NumOfSimPages = self.NumOfSimPagesSpn.value() if UseSpinValue else 0
        self.LoadUrls()
        ProjectClass.Save(self.Project)
            
            
    #new pattern textbox value changed        
    def NewPatternChanged(self):
        """Enable the add-pattern button only while the new-pattern box holds non-blank text."""
        Typed = unicode(self.NewPatternTxt.text()).strip()
        self.AddPatternBtn.setEnabled(bool(Typed))
           
            
    #new url textbox value changed  
    def NewUrlChanged(self):
        """Enable the add-url button only while the stripped text of the new-url box matches UrlRegex."""
        Typed = unicode(self.NewUrlTxt.text()).strip()
        Matched = match(UrlRegex, Typed)
        self.AddUrlBtn.setEnabled(bool(Matched))
    
    
    #save new pattern into db and add it to tree
    def AddNewPatternToList(self):
        """Store the pattern typed in the new-pattern box and refresh the patterns tree.

        A pattern reported as duplicate is rejected with an error box and
        nothing is saved; on success the box is cleared and a confirmation shown.
        """
        Typed = unicode(self.NewPatternTxt.text()).strip()
        Candidate = FetchPatterns(Typed , Cursor=self.Cursor , RepLiter = self.Project.RepLiter)
        if not Candidate.IsDuplicate():
            Candidate.SaveInDB()
            self.LoadPatterns()  # rebuild the tree from the database
            self.NewPatternTxt.clear()
            QtGui.QMessageBox.information(self , ProjectTitle , u"Pattern successfully inserted into patterns list")
        else:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"This pattern already inserted into patterns list")
    
    
    def AddNewUrlToList(self):
        """Store the url typed in the new-url box and try to load urls again.

        A url reported as duplicate is rejected with an error box and nothing
        is saved; on success the box is cleared and a confirmation shown.
        """
        Typed = unicode(self.NewUrlTxt.text()).strip()
        Candidate = QueueUrl(Typed , Cursor = self.Cursor , RepLiter = self.Project.RepLiter)
        if not Candidate.IsDuplicate():
            Candidate.SaveInDB()
            self.LoadUrls()
            self.NewUrlTxt.clear()
            QtGui.QMessageBox.information(self , ProjectTitle , u"Url successfully inserted into urls list")
        else:
            QtGui.QMessageBox.critical(self , ProjectTitle , u"This url already inserted into urls list")
        
    
    
    def SetNumOfUrlsFound(self , UrlID , NumOfFound):
        """Write NumOfFound into column 4 of the url row whose column 5 equals UrlID.

        When a row was found and NumOfFound is not u"0", the queue counter is
        increased by one and the statistic labels are refreshed.
        """
        Rows = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if not Rows:
            return  # no row for this url id
        Rows[0].setText(4 , NumOfFound)
        if NumOfFound != u"0":
            self.NumOfUrlsInQueueForStats += 1
            self.UpdateCrawlerStats()
                
        

    def SetNumOfFoundPatternsForUrl(self , UrlID , NumOfFound):
        """Write NumOfFound into column 3 of the url row whose column 5 equals UrlID.

        When a row was found and NumOfFound is not u"0", the patterns counter
        is increased by one and the statistic labels are refreshed.
        """
        Rows = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if not Rows:
            return  # no row for this url id
        Rows[0].setText(3 , NumOfFound)
        if NumOfFound != u"0":
            self.NumOfPatternsForStats += 1
            self.UpdateCrawlerStats()
                
    

    def SetNumOfFoundPatternsForPattern(self , PatternID):
        """Add one to the number in column 2 of the pattern row whose column 3 equals PatternID."""
        Rows = self.PatternsTree.findItems(PatternID, QtCore.Qt.MatchExactly, 3)
        if not Rows:
            return  # no row for this pattern id
        Row = Rows[0]
        Row.setText(2 , unicode(int(Row.text(2)) + 1))
        
    
    def SetPageState(self , UrlID , State):
        """Show State in column 2 of the url row whose column 5 equals UrlID.

        For the states "Connection failed..." and "Finished" the url is also
        removed from the tree, from self.Urls and from the list of urls being
        crawled; "Finished" increases the browsed-pages counter. New urls are
        then loaded and, 100 ms later, crawling continues - but only if the
        crawler has not been stopped in the meantime.
        """
        items = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if not items:
            return
        items[0].setText(2 , State)
        if State not in ("Connection failed...","Finished"):
            return

        #remove item from tree if connection failed or page processing finished
        Index = self.UrlsTree.indexOfTopLevelItem(items[0])
        del self.Urls[Index]
        self.UrlsTree.takeTopLevelItem(Index)
        # tolerate an id that is no longer listed instead of raising ValueError
        CrawledID = int(UrlID)
        if CrawledID in self.CurrentCrawlingUrls:
            self.CurrentCrawlingUrls.remove(CrawledID)

        if State == "Finished":
            self.NumOfBrowsedUrlsForStats += 1
            self.UpdateCrawlerStats()

        self.LoadUrls()#load new urls

        def ResumeIfStillCrawling():
            # checked when the timer fires: a Stop issued during the delay must
            # not be undone by this delayed restart
            if self.IsCrawling:
                self.StartCrawler()

        QtCore.QTimer.singleShot(100, ResumeIfStillCrawling)#start browsing new urls
            
                
            
    def SetElapsedTime(self , Time):
        """Display Time in the elapsed-time label."""
        Label = self.ElapesdTimeLbl
        Label.setText(Time)
    

    def UpdateCrawlerStats(self):
        """Write the three statistic counters into their labels."""
        Readouts = (
            (self.NumOfBrowsedPagesLbl , self.NumOfBrowsedUrlsForStats),
            (self.NumOfFetchedPatternsLbl , self.NumOfPatternsForStats),
            (self.NumOfQueueItemsLbl , self.NumOfUrlsInQueueForStats),
        )
        for Label , Value in Readouts:
            Label.setText(unicode(Value))
        
    
    def closeEvent(self, event):
        """Ignore the close event and hand the decision to checkAndExit()."""
        # the event is always ignored here; checkAndExit() decides what happens next
        event.ignore()
        self.checkAndExit()
    
    
    def checkAndExit(self):
        """Exit the program, asking for confirmation first while crawling is running.

        If the user does not answer Yes the method returns without exiting;
        on Yes the crawler is stopped before exit(0) is called.
        """
        if self.IsCrawling:
            Answer = QtGui.QMessageBox.question(self, "exit", "Crawling process is running. do you want to stop it and exit?", QtGui.QMessageBox.Yes|QtGui.QMessageBox.No, QtGui.QMessageBox.No)
            if not (Answer == QtGui.QMessageBox.Yes):
                return
            self.StopCrawler()
        exit(0)
            
    
        
    def __init__(self):
        """Build the window, initialise the crawler state and connect signals to slots.

        The three group boxes start disabled; LoadProject() enables them.
        """
        #init the gui
        super(PoriaSimpleCrawlerMain, self).__init__()
        super(PoriaSimpleCrawlerMain, self).setupUi(self)
        self.setWindowState(QtCore.Qt.WindowMaximized)
        self.groupBox.setEnabled(False)
        self.groupBox_2.setEnabled(False)
        self.groupBox_3.setEnabled(False)
        #end of init gui
        
        self.MaxUrlID = 0 #maximum id for url to fetch from db
        self.SelectedUrls = [] #which urls are enabled and selected
        self.SelectedPatterns = [] #which patterns are enabled and selected to fetch
        self.CurrentCrawlingUrls = [] #current crawling urls
        self.EngineThreads = [] #crawling threads
        self.ElapsedTimerStarted = self.IsCrawling = False
        self.NumOfBrowsedUrlsForStats = self.NumOfPatternsForStats = self.NumOfUrlsInQueueForStats = 0
        
        
        #connecting signals and slots
        self.connect(self.actionNewProject , QtCore.SIGNAL("triggered()") , self.ShowNewProjectWizard)
        self.connect(self.actionOpenProject , QtCore.SIGNAL("triggered()") , self.ShowOpenFormDialog)
        self.connect(self.UrlsTree , QtCore.SIGNAL("itemClicked(QTreeWidgetItem* , int)") , self.UrlItemClicked)
        self.connect(self.PatternsTree , QtCore.SIGNAL("itemClicked(QTreeWidgetItem* , int)") , self.PatternsItemClicked)
        # the Qt signal is toggled(bool); with the signature "toggled()" the
        # slot was never invoked when the check box changed
        self.connect(self.NumOfSimPagesChk , QtCore.SIGNAL("toggled(bool)") , self.NumOfSimPagesChanged)
        self.connect(self.NumOfSimPagesSpn , QtCore.SIGNAL("valueChanged(int)") , self.NumOfSimPagesChanged)
        self.connect(self.NewPatternTxt , QtCore.SIGNAL("textChanged(QString)") , self.NewPatternChanged)
        self.connect(self.NewUrlTxt , QtCore.SIGNAL("textChanged(QString)") , self.NewUrlChanged)
        self.connect(self.RemoveSelectedUrlsBtn , QtCore.SIGNAL("clicked()") , self.RemoveSelectedUrls)
        self.connect(self.RemoveSelectedPatternsBtn , QtCore.SIGNAL("clicked()") , self.RemoveSelectedPatterns)
        self.connect(self.AddPatternBtn , QtCore.SIGNAL("clicked()") , self.AddNewPatternToList)
        self.connect(self.AddUrlBtn , QtCore.SIGNAL("clicked()") , self.AddNewUrlToList)
        # the signals below are connected on the window object itself
        self.connect(self , QtCore.SIGNAL("NumOfFoundUrls(QString , QString)") , self.SetNumOfUrlsFound)
        self.connect(self , QtCore.SIGNAL("NumOfFoundPatternsForUrl(QString , QString)") , self.SetNumOfFoundPatternsForUrl)
        self.connect(self , QtCore.SIGNAL("IncreaseFoundPatternsForPattern(QString)") , self.SetNumOfFoundPatternsForPattern)
        self.connect(self.StartCrawlingBtn , QtCore.SIGNAL("clicked()") , self.StartCrawler)
        self.connect(self.StopCrawlingBtn , QtCore.SIGNAL("clicked()") , self.StopCrawler)
        self.connect(self , QtCore.SIGNAL("PageState(QString , QString)") , self.SetPageState)
        self.connect(self , QtCore.SIGNAL("GetElapsedTime(QString)") , self.SetElapsedTime)
        self.connect(self.actionQuit , QtCore.SIGNAL("triggered()") , self.checkAndExit)