def StartCrawler(self):
    """Start an Engine thread for each queued URL that is not being crawled yet.

    Also switches the main buttons to the "started" state, flags the
    crawler as running and, if the elapsed-time timer is not running,
    creates and starts a new Timer (continuing from the previous timer's
    Hour/Minute/Second when one exists, otherwise from the values stored
    for the project).
    """
    self.SetButtonsState(Started = True)

    for queued in self.Urls:
        # Only URLs that are not already in flight get a new thread.
        if queued.ID not in self.CurrentCrawlingUrls:
            self.CurrentCrawlingUrls.append(queued.ID)
            worker = Engine(Url = queued,
                            PatternsList = self.Patterns,
                            RepLiter = self.Project.RepLiter,
                            Parent = self,
                            DBEngine = self.Project.DBEngine,
                            Connection = self.DatabaseConnection,
                            Cursor = self.Cursor,
                            AllToGether = self.AllPatternsChk.isChecked())
            self.EngineThreads.append(worker)
            worker.start()

    self.IsCrawling = True

    # Nothing more to do when the timer is already ticking.
    if self.ElapsedTimerStarted:
        return

    self.ElapsedTimerStarted = True
    if hasattr(self, "ElapsedTimer"):
        # Resume counting from where the previous timer object stopped.
        previous = self.ElapsedTimer
        fresh = Timer(self,
                      Hour = previous.Hour,
                      Minute = previous.Minute,
                      Second = previous.Second)
    else:
        # First start: begin from the elapsed time stored for the project.
        fresh = Timer(self,
                      self.ElapedHour_Project,
                      self.ElapedMinute_Project,
                      self.ElapedSeconds_Project)
    self.ElapsedTimer = fresh
    fresh.start()
class PoriaSimpleCrawlerMain(QtGui.QMainWindow, Ui_PoriaSimpleCrawler):
    """Main window of the crawler.

    Holds the currently loaded project, its database connection/cursor,
    the in-memory lists of patterns and queued URLs that mirror the two
    tree widgets, and the Engine threads that do the crawling.
    """

    def ShowNewProjectWizard(self):
        """Create and show the new-project wizard."""
        self.NewProjectWizard = PoriaCrawlerNewProjectWizard(self)
        self.NewProjectWizard.show()

    def ShowOpenFormDialog(self):
        """Let the user pick a project file and load it."""
        ProjectFilePath = QtGui.QFileDialog.getOpenFileNameAndFilter(
            parent=self,
            caption=u"Open Project",
            filter="Poria's Simple Crawler | (*.psc)")[0]
        if not ProjectFilePath:
            # Dialog was cancelled.
            return
        try:
            self.Project = ProjectClass.LoadFromFile(ProjectFilePath)
        except InvalidProjectFile:
            QtGui.QMessageBox.critical(self, ProjectTitle, u"Project file is not valid!")
            return
        # Establish the db connection and load the project attributes.
        self.LoadProject()

    def LoadPatterns(self):
        """Reload all patterns from the database and rebuild the patterns tree."""
        self.Patterns = FetchPatterns.ConvertFromDatabaseFetchList(
            FetchPatterns.FetchDBPatterns(self.Cursor))
        self.PatternsTree.clear()
        # The tree items were just replaced, so any previously tracked
        # selection refers to items that are no longer in the tree.
        self.SelectedPatterns = []
        self.RemoveSelectedPatternsBtn.setEnabled(False)
        for Number, Pattern in enumerate(self.Patterns, 1):
            Item = QtGui.QTreeWidgetItem([unicode(Number)] + Pattern.ToTreeWidgetItemList())
            Item.setCheckState(0, QtCore.Qt.Checked)
            self.PatternsTree.addTopLevelItem(Item)

    def LoadUrls(self):
        """Bring the URL list/tree in line with the number of simultaneous pages.

        Fetches more URLs from the database when fewer than the configured
        number are loaded (3 when the setting is 0).  When more URLs are
        loaded than configured and the timer has not been started, the
        extra URLs are dropped from the end of the list and the tree.
        """
        UrlsCount = len(self.Urls)
        SimPages = self.Project.NumOfSimPages
        if SimPages == 0:
            # Default number of simultaneous urls is 3.
            # NOTE(review): Limit becomes negative when more than 3 urls are
            # already loaded; how FetchDBUrls treats that is not visible here.
            Limit = 3 - UrlsCount
        elif SimPages > UrlsCount:
            Limit = SimPages - UrlsCount
        else:
            if not self.ElapsedTimerStarted and SimPages < UrlsCount:
                # Drop the extra urls from the tail.  The loop condition is
                # re-evaluated against the live list length; the previous
                # version compared a counter that was never updated and so
                # kept deleting until the list was empty and raised.
                while len(self.Urls) > SimPages:
                    self.UrlsTree.takeTopLevelItem(len(self.Urls) - 1)
                    del self.Urls[-1]
                self.ReArrangeUrlsNum()
            return

        # Fetch new urls from the db, skipping ones already loaded.
        Fetched = QueueUrl.ConvertFromDatabaseFetchList(
            QueueUrl.FetchDBUrls(self.Cursor, Limit, self.MaxUrlID), self)
        NewItems = [url for url in Fetched if url not in self.Urls]
        self.Urls.extend(NewItems)
        self.NumOfTotalUrls = QueueUrl.CountUrls(self.Cursor)
        for Number, Url in enumerate(NewItems, UrlsCount + 1):
            Item = QtGui.QTreeWidgetItem([unicode(Number)] + Url.ToTreeWidgetItemList())
            Item.setCheckState(0, QtCore.Qt.Checked)
            self.UrlsTree.addTopLevelItem(Item)
        self.ReArrangeUrlsNum()

    def ReArrangeUrlsNum(self):
        """Renumber column 0 of the URL tree as 1..N in display order."""
        for itemIndex in range(self.UrlsTree.topLevelItemCount()):
            self.UrlsTree.topLevelItem(itemIndex).setText(0, unicode(itemIndex + 1))

    def LoadProject(self):
        """Connect to the project's database and initialise the UI from the project.

        Shows an error box and returns when the connection cannot be
        established; DBNotExistAndNoCreate and AttributeError raised while
        connecting propagate to the caller.
        """
        try:
            DatabaseConnection = StablishDatabaseConnection.GetConnection(self.Project, self)
            DatabaseConnection.isolation_level = None
            self.DatabaseConnection = DatabaseConnection
            self.Cursor = DatabaseConnection.cursor()
            # Placeholder used in queries, depending on the selected db.
            self.Project.RepLiter = "?" if self.Project.DBEngine == "sqlite" else '%s'
        except DatabaseConnectionStablishError:
            QtGui.QMessageBox.critical(
                self, ProjectTitle, u"Database connection could not be stablished!")
            return
        except (DBNotExistAndNoCreate, AttributeError):
            # Deliberately propagated (an unreachable `return` used to follow).
            raise

        self.Urls = []
        CreateDatabaseTables.CreateTables(self.Cursor, self.Project, self)
        self.UrlsTree.clear()
        # NOTE(review): this overwrites Project.NumOfSimPages with the
        # current widget state before the widgets are set from the project.
        self.NumOfSimPagesChanged()
        self.LoadPatterns()
        self.LoadUrls()
        self.NewPatternTxt.clear()
        self.NewUrlTxt.clear()

        # Stats and timer labels.
        self.NumOfBrowsedPagesLbl.setText(self.Project.NumOfBrowsedPages)
        self.NumOfFetchedPatternsLbl.setText(self.Project.NumOfFetchedPatterns)
        self.NumOfQueueItemsLbl.setText(self.Project.NumOfQueueItems)
        if self.Project.ElapesdTime == "0":
            # Project has not been started yet.
            self.ElapesdTimeLbl.setText("00 : 00 : 00")
            self.ElapedHour_Project = self.ElapedMinute_Project = self.ElapedSeconds_Project = 0
        else:
            # Elapsed time is stored as three ":"-separated integers.
            self.ElapesdTimeLbl.setText(self.Project.ElapesdTime)
            ElapsedTime = self.Project.ElapesdTime.split(":")
            self.ElapedHour_Project = int(ElapsedTime[0])
            self.ElapedMinute_Project = int(ElapsedTime[1])
            self.ElapedSeconds_Project = int(ElapsedTime[2])

        self.AllPatternsChk.setChecked(self.Project.AllPatternsTogether != "0")

        if self.Project.NumOfSimPages == 0:
            self.NumOfSimPagesChk.setChecked(False)
            self.NumOfSimPagesSpn.setValue(1)
        else:
            self.NumOfSimPagesChk.setChecked(True)
            self.NumOfSimPagesSpn.setValue(int(self.Project.NumOfSimPages))

        self.Project.MaxBrowsingPages = int(self.Project.MaxBrowsingPages)
        self.groupBox.setEnabled(True)
        self.groupBox_2.setEnabled(True)
        self.groupBox_3.setEnabled(True)
        self.SetButtonsState(Stopped=True)

    @staticmethod
    def _TrackCheckedItem(Item, Selected, RemoveButton):
        """Add/remove Item in Selected according to its check box, then sync RemoveButton."""
        if Item is not None:
            if Item.checkState(0) == QtCore.Qt.Checked:
                if Item not in Selected:
                    Selected.append(Item)
            elif Item in Selected:
                Selected.remove(Item)
        RemoveButton.setEnabled(bool(Selected))

    def PatternsItemClicked(self):
        """Track the clicked pattern item in SelectedPatterns by its check state."""
        self._TrackCheckedItem(self.PatternsTree.currentItem(),
                               self.SelectedPatterns,
                               self.RemoveSelectedPatternsBtn)

    def UrlItemClicked(self):
        """Track the clicked url item in SelectedUrls by its check state."""
        self._TrackCheckedItem(self.UrlsTree.currentItem(),
                               self.SelectedUrls,
                               self.RemoveSelectedUrlsBtn)

    def RemoveSelectedPatterns(self):
        """Delete the selected patterns from the tree, the list and the database.

        Refuses (with an error box) to remove every pattern.
        """
        if len(self.Patterns) - len(self.SelectedPatterns) < 1:
            QtGui.QMessageBox.critical(self, ProjectTitle, u"Patterns list could not be empty")
            return
        for Item in self.SelectedPatterns:
            Index = self.PatternsTree.indexOfTopLevelItem(Item)
            if Index < 0:
                # Not in the tree any more; deleting index -1 would drop
                # the wrong (last) pattern.
                continue
            self.PatternsTree.takeTopLevelItem(Index)
            # Column 3 holds the pattern id.
            FetchPatterns.DeleteFromDB(self.Cursor, int(Item.text(3)), self.Project.RepLiter)
            del self.Patterns[Index]
        # Rebuilds the tree and resets the selection.
        self.LoadPatterns()

    def RemoveSelectedUrls(self):
        """Delete the selected urls from the tree, the list and the database.

        Refuses (with an error box) to remove every url.
        """
        # Ignore selected items that have since left the tree (already
        # removed, finished, or trimmed); for those indexOfTopLevelItem
        # returns -1 and `del self.Urls[-1]` would drop the wrong url.
        Selected = [Item for Item in self.SelectedUrls
                    if self.UrlsTree.indexOfTopLevelItem(Item) >= 0]
        if len(self.Urls) - len(Selected) < 1:
            QtGui.QMessageBox.critical(self, ProjectTitle, u"Urls queue could not be empty")
            return
        for Item in Selected:
            Index = self.UrlsTree.indexOfTopLevelItem(Item)
            self.UrlsTree.takeTopLevelItem(Index)
            # Column 5 holds the url id.
            UrlObj = QueueUrl(ID=unicode(Item.text(5)),
                              RepLiter=self.Project.RepLiter,
                              Cursor=self.Cursor)
            UrlObj.DeleteFromDB()
            del self.Urls[Index]
        # The removed items must not be processed again on the next click.
        self.SelectedUrls = []
        self.RemoveSelectedUrlsBtn.setEnabled(False)
        self.LoadUrls()

    def SetButtonsState(self, Started=False, Paused=False, Stopped=False):
        """Enable/disable Start, Pause and Stop for the first true flag.

        Precedence is Started, then Paused, then Stopped; with no flag set
        the buttons are left untouched.
        """
        if Started:
            States = (False, True, True)
        elif Paused:
            States = (True, False, True)
        elif Stopped:
            States = (True, False, False)
        else:
            return
        self.StartCrawlingBtn.setEnabled(States[0])
        self.PauseCrawlingBtn.setEnabled(States[1])
        self.StopCrawlingBtn.setEnabled(States[2])

    def StopCrawler(self):
        """Stop the timer, terminate all crawling threads and reset button state."""
        self.SetButtonsState(Stopped=True)
        self.ElapsedTimer.emit(QtCore.SIGNAL("StopTimer()"))
        self.ElapsedTimerStarted = self.IsCrawling = False
        for Thread in self.EngineThreads:
            Thread.terminate()
        # The terminated threads will never report completion, so their
        # urls must be forgotten here; otherwise the next StartCrawler
        # would skip them as "already crawling".
        del self.CurrentCrawlingUrls[:]

    def StartCrawler(self):
        """Start an Engine thread per queued url not yet crawling and run the timer."""
        self.SetButtonsState(Started=True)
        for url in self.Urls:
            if url.ID in self.CurrentCrawlingUrls:
                # Prevent browsing the same url twice.
                continue
            self.CurrentCrawlingUrls.append(url.ID)
            Thread = Engine(Url=url,
                            PatternsList=self.Patterns,
                            RepLiter=self.Project.RepLiter,
                            Parent=self,
                            DBEngine=self.Project.DBEngine,
                            Connection=self.DatabaseConnection,
                            Cursor=self.Cursor,
                            AllToGether=self.AllPatternsChk.isChecked())
            self.EngineThreads.append(Thread)
            Thread.start()
        self.IsCrawling = True
        if not self.ElapsedTimerStarted:
            self.ElapsedTimerStarted = True
            if hasattr(self, "ElapsedTimer"):
                # Continue from where the previous timer object stopped.
                self.ElapsedTimer = Timer(self,
                                          Hour=self.ElapsedTimer.Hour,
                                          Minute=self.ElapsedTimer.Minute,
                                          Second=self.ElapsedTimer.Second)
            else:
                self.ElapsedTimer = Timer(self,
                                          self.ElapedHour_Project,
                                          self.ElapedMinute_Project,
                                          self.ElapedSeconds_Project)
            self.ElapsedTimer.start()

    def NumOfSimPagesChanged(self):
        """Store the simultaneous-pages setting (0 when unchecked), reload urls, save."""
        if self.NumOfSimPagesChk.isChecked():
            self.Project.NumOfSimPages = self.NumOfSimPagesSpn.value()
        else:
            self.Project.NumOfSimPages = 0
        self.LoadUrls()
        ProjectClass.Save(self.Project)

    def NewPatternChanged(self):
        """Enable the add-pattern button only for non-blank text."""
        Pattern = unicode(self.NewPatternTxt.text()).strip()
        self.AddPatternBtn.setEnabled(bool(Pattern))

    def NewUrlChanged(self):
        """Enable the add-url button only when the text matches UrlRegex."""
        Url = unicode(self.NewUrlTxt.text()).strip()
        self.AddUrlBtn.setEnabled(bool(match(UrlRegex, Url)))

    def AddNewPatternToList(self):
        """Save the typed pattern in the database (unless duplicate) and reload the tree."""
        PatternTxt = unicode(self.NewPatternTxt.text()).strip()
        Pattern = FetchPatterns(PatternTxt, Cursor=self.Cursor, RepLiter=self.Project.RepLiter)
        if Pattern.IsDuplicate():
            QtGui.QMessageBox.critical(
                self, ProjectTitle, u"This pattern already inserted into patterns list")
            return
        Pattern.SaveInDB()
        self.LoadPatterns()
        self.NewPatternTxt.clear()
        QtGui.QMessageBox.information(
            self, ProjectTitle, u"Pattern successfully inserted into patterns list")

    def AddNewUrlToList(self):
        """Save the typed url in the database (unless duplicate) and try to load urls."""
        UrlTxt = unicode(self.NewUrlTxt.text()).strip()
        Url = QueueUrl(UrlTxt, Cursor=self.Cursor, RepLiter=self.Project.RepLiter)
        if Url.IsDuplicate():
            QtGui.QMessageBox.critical(
                self, ProjectTitle, u"This url already inserted into urls list")
            return
        Url.SaveInDB()
        self.LoadUrls()
        self.NewUrlTxt.clear()
        QtGui.QMessageBox.information(
            self, ProjectTitle, u"Url successfully inserted into urls list")

    def SetNumOfUrlsFound(self, UrlID, NumOfFound):
        """Show, in column 4 of the url's row, how many new urls its page yielded."""
        items = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if items:
            items[0].setText(4, NumOfFound)
            if NumOfFound != u"0":
                self.NumOfUrlsInQueueForStats += 1
                self.UpdateCrawlerStats()

    def SetNumOfFoundPatternsForUrl(self, UrlID, NumOfFound):
        """Show, in column 3 of the url's row, how many patterns its page yielded."""
        items = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if items:
            items[0].setText(3, NumOfFound)
            if NumOfFound != u"0":
                self.NumOfPatternsForStats += 1
                self.UpdateCrawlerStats()

    def SetNumOfFoundPatternsForPattern(self, PatternID):
        """Increment the hit counter (column 2) of the pattern with the given id."""
        items = self.PatternsTree.findItems(PatternID, QtCore.Qt.MatchExactly, 3)
        if items:
            items[0].setText(2, unicode(int(items[0].text(2)) + 1))

    def SetPageState(self, UrlID, State):
        """Show a page's crawling state; on completion/failure drop it and continue.

        When State is "Connection failed..." or "Finished" the url is
        removed from the list and the tree, new urls are loaded and, if the
        crawler is still running, crawling of the new urls is scheduled.
        """
        items = self.UrlsTree.findItems(UrlID, QtCore.Qt.MatchExactly, 5)
        if not items:
            return
        items[0].setText(2, State)
        if State not in ("Connection failed...", "Finished"):
            return
        Index = self.UrlsTree.indexOfTopLevelItem(items[0])
        del self.Urls[Index]
        self.UrlsTree.takeTopLevelItem(Index)
        UrlKey = int(UrlID)
        # StopCrawler empties this list, so a state change that arrives
        # afterwards may refer to an id that is no longer tracked.
        if UrlKey in self.CurrentCrawlingUrls:
            self.CurrentCrawlingUrls.remove(UrlKey)
        if State == "Finished":
            self.NumOfBrowsedUrlsForStats += 1
            self.UpdateCrawlerStats()
        self.LoadUrls()
        if self.IsCrawling:
            # Do not silently restart a crawler the user has stopped.
            QtCore.QTimer.singleShot(100, self.StartCrawler)

    def SetElapsedTime(self, Time):
        """Display the elapsed time text."""
        self.ElapesdTimeLbl.setText(Time)

    def UpdateCrawlerStats(self):
        """Refresh the three stats labels from the in-memory counters."""
        self.NumOfBrowsedPagesLbl.setText(unicode(self.NumOfBrowsedUrlsForStats))
        self.NumOfFetchedPatternsLbl.setText(unicode(self.NumOfPatternsForStats))
        self.NumOfQueueItemsLbl.setText(unicode(self.NumOfUrlsInQueueForStats))

    def closeEvent(self, event):
        """Route window closing through checkAndExit instead of closing directly."""
        event.ignore()
        self.checkAndExit()

    def checkAndExit(self):
        """Exit the application, asking for confirmation while crawling."""
        if self.IsCrawling:
            Answer = QtGui.QMessageBox.question(
                self, "exit",
                "Crawling process is running. do you want to stop it and exit?",
                QtGui.QMessageBox.Yes | QtGui.QMessageBox.No,
                QtGui.QMessageBox.No)
            if Answer != QtGui.QMessageBox.Yes:
                return
            self.StopCrawler()
        exit(0)

    def __init__(self):
        """Build the UI, initialise crawler state and wire signals to slots."""
        super(PoriaSimpleCrawlerMain, self).__init__()
        super(PoriaSimpleCrawlerMain, self).setupUi(self)
        self.setWindowState(QtCore.Qt.WindowMaximized)
        # Controls stay disabled until a project is loaded.
        self.groupBox.setEnabled(False)
        self.groupBox_2.setEnabled(False)
        self.groupBox_3.setEnabled(False)

        self.MaxUrlID = 0              # maximum id for url to fetch from db
        self.SelectedUrls = []         # url tree items currently checked/selected
        self.SelectedPatterns = []     # pattern tree items currently checked/selected
        self.CurrentCrawlingUrls = []  # ids of urls being crawled right now
        self.EngineThreads = []        # crawling threads
        self.ElapsedTimerStarted = self.IsCrawling = False
        self.NumOfBrowsedUrlsForStats = self.NumOfPatternsForStats = self.NumOfUrlsInQueueForStats = 0

        # (sender, signal signature, slot)
        Connections = (
            (self.actionNewProject, "triggered()", self.ShowNewProjectWizard),
            (self.actionOpenProject, "triggered()", self.ShowOpenFormDialog),
            (self.UrlsTree, "itemClicked(QTreeWidgetItem* , int)", self.UrlItemClicked),
            (self.PatternsTree, "itemClicked(QTreeWidgetItem* , int)", self.PatternsItemClicked),
            # The check box signal carries the new state: toggled(bool).
            # With "toggled()" the slot was never invoked.
            (self.NumOfSimPagesChk, "toggled(bool)", self.NumOfSimPagesChanged),
            (self.NumOfSimPagesSpn, "valueChanged(int)", self.NumOfSimPagesChanged),
            (self.NewPatternTxt, "textChanged(QString)", self.NewPatternChanged),
            (self.NewUrlTxt, "textChanged(QString)", self.NewUrlChanged),
            (self.RemoveSelectedUrlsBtn, "clicked()", self.RemoveSelectedUrls),
            (self.RemoveSelectedPatternsBtn, "clicked()", self.RemoveSelectedPatterns),
            (self.AddPatternBtn, "clicked()", self.AddNewPatternToList),
            (self.AddUrlBtn, "clicked()", self.AddNewUrlToList),
            (self, "NumOfFoundUrls(QString , QString)", self.SetNumOfUrlsFound),
            (self, "NumOfFoundPatternsForUrl(QString , QString)", self.SetNumOfFoundPatternsForUrl),
            (self, "IncreaseFoundPatternsForPattern(QString)", self.SetNumOfFoundPatternsForPattern),
            (self.StartCrawlingBtn, "clicked()", self.StartCrawler),
            (self.StopCrawlingBtn, "clicked()", self.StopCrawler),
            (self, "PageState(QString , QString)", self.SetPageState),
            (self, "GetElapsedTime(QString)", self.SetElapsedTime),
            (self.actionQuit, "triggered()", self.checkAndExit),
        )
        for Sender, Signature, Slot in Connections:
            self.connect(Sender, QtCore.SIGNAL(Signature), Slot)