Exemplo n.º 1
0
 def __init__(self, rddtDataExtractor, validUsersOrSubs, queue, listModelType):
     """
     Set up the coordinator that drives downloading of submission content.

     Stores the collaborators passed in and creates a private thread pool, capped at four threads, for the
     per-user / per-subreddit download jobs.

     :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
     :type validUsersOrSubs: list
     :type queue: queue.Queue
     :type listModelType: RedditDataExtractor.redditDataExtractor.ListType
     """
     super().__init__()

     # Flags: operation is allowed to continue from the start; the finish marker starts cleared.
     self._continueOperation = True
     self.finishSignalForTest = False

     # Collaborators supplied by the caller.
     self._rddtDataExtractor = rddtDataExtractor
     self._validUsersOrSubs = validUsersOrSubs
     self._listModelType = listModelType
     self._queue = queue

     # Private pool for the download jobs, limited to four concurrent threads.
     pool = QThreadPool()
     pool.setMaxThreadCount(4)
     self._dataPool = pool
Exemplo n.º 2
0
    def __init__(self, rddtDataExtractor, lstModelObj, validatedPRAWUserOrSub, queue, lstModelType, isStopped):
        """
        Set up the job that downloads content for one user or subreddit.

        Stores the collaborators passed in and creates three private thread pools: images (3 threads),
        submission data (3 threads) and videos (2 threads).

        :param lstModelObj: The User or Subreddit "ListModel" Object
        :param isStopped: zero-argument callable stored as ``_downloaderIsStopped``
        :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
        :type lstModelObj: RedditDataExtractor.GUI.genericListModelObjects.GenericListModelObj
        :type validatedPRAWUserOrSub: praw.objects.Subreddit or praw.objects.User
        :type queue: queue.Queue
        :type lstModelType: RedditDataExtractor.redditDataExtractor.ListType
        :type isStopped: function
        """
        super().__init__()

        # No timestamp has been recorded yet.
        self._mostRecentDownloadTimestamp = None
        self._downloaderIsStopped = isStopped

        # Collaborators supplied by the caller.
        self._rddtDataExtractor = rddtDataExtractor
        self._validatedPRAWUserOrSub = validatedPRAWUserOrSub
        self._lstModelObj = lstModelObj
        self._lstModelType = lstModelType
        self._queue = queue

        # One private pool per kind of content, each with its own thread cap.
        imagePool = QThreadPool()
        imagePool.setMaxThreadCount(3)
        self._imagePool = imagePool

        submissionPool = QThreadPool()
        submissionPool.setMaxThreadCount(3)
        self._submissionPool = submissionPool

        videoPool = QThreadPool()
        videoPool.setMaxThreadCount(2)
        self._videoPool = videoPool
Exemplo n.º 3
0
class Downloader(QObject):
    """
    Coordinates a download session: starts one Worker per (list model object, PRAW user/subreddit) pair on a
    private thread pool and emits ``finished`` once all of them are done.
    """

    # Emitted at the end of run(), after the pool has finished (or immediately when there is nothing to do).
    finished = pyqtSignal()

    def __init__(self, rddtDataExtractor, validUsersOrSubs, queue, listModelType):
        """
        Store the collaborators and create the thread pool (capped at four threads) used for the Worker jobs.

        :param validUsersOrSubs: iterated in run() as (list model object, validated PRAW user or subreddit) pairs
        :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
        :type validUsersOrSubs: list
        :type queue: queue.Queue
        :type listModelType: RedditDataExtractor.redditDataExtractor.ListType
        """
        super().__init__()

        # Flags: running is allowed from the start; the finish marker starts cleared.
        self._continueOperation = True
        self.finishSignalForTest = False

        # Collaborators supplied by the caller.
        self._rddtDataExtractor = rddtDataExtractor
        self._validUsersOrSubs = validUsersOrSubs
        self._listModelType = listModelType
        self._queue = queue

        pool = QThreadPool()
        pool.setMaxThreadCount(4)
        self._dataPool = pool

    def stop(self):
        """Request a stop; from now on isStopped() returns True."""
        self._continueOperation = False

    def isStopped(self):
        """Return True once stop() has been called."""
        return not self._continueOperation

    @pyqtSlot()
    def run(self):
        """
        Start a Worker for every pair in the list given at construction, wait for all of them, then emit
        ``finished``.

        Marks the extractor as currently downloading before starting. ``finishSignalForTest`` is cleared on
        entry and set to True only after ``finished`` has been emitted.
        """
        self.finishSignalForTest = False
        self._rddtDataExtractor.currentlyDownloading = True

        targets = self._validUsersOrSubs
        if len(targets) > 0:
            for modelObj, prawTarget in targets:
                # Each job gets isStopped as a callable so it can poll for a stop request.
                job = Worker(self._rddtDataExtractor, modelObj, prawTarget, self._queue, self._listModelType,
                             self.isStopped)
                self._dataPool.start(job)
            # Block until every started job has completed.
            self._dataPool.waitForDone()

        self.finished.emit()
        self.finishSignalForTest = True
Exemplo n.º 4
0
class Worker(QRunnable):
    """
    Runnable that downloads the new content of a single user or subreddit.

    Owns three private thread pools (images, submission JSON data, videos) and keeps track of the newest
    timestamp reported back by the jobs it starts.
    """

    def __init__(self, rddtDataExtractor, lstModelObj, validatedPRAWUserOrSub, queue, lstModelType, isStopped):
        """
        Thread to download for a submission. Spawns more threads for downloading images or submission json data

        :param lstModelObj: The User or Subreddit "ListModel" Object
        :param isStopped: zero-argument callable that returns True once the download session has been stopped
        :type rddtDataExtractor: RedditDataExtractor.redditDataExtractor.RedditDataExtractor
        :type lstModelObj: RedditDataExtractor.GUI.genericListModelObjects.GenericListModelObj
        :type validatedPRAWUserOrSub: praw.objects.Subreddit or praw.objects.User
        :type queue: queue.Queue
        :type lstModelType: RedditDataExtractor.redditDataExtractor.ListType
        :type isStopped: function
        """
        # Function-scope import so the class does not depend on a module-level import being present.
        import threading

        super().__init__()

        self._rddtDataExtractor = rddtDataExtractor
        self._lstModelObj = lstModelObj
        self._validatedPRAWUserOrSub = validatedPRAWUserOrSub
        self._queue = queue
        self._lstModelType = lstModelType
        self._imagePool = QThreadPool()
        self._imagePool.setMaxThreadCount(3)
        self._submissionPool = QThreadPool()
        self._submissionPool.setMaxThreadCount(3)
        self._videoPool = QThreadPool()
        self._videoPool.setMaxThreadCount(2)
        self._mostRecentDownloadTimestamp = None
        # setMostRecentDownloadTimestamp is handed to jobs running on the pools above, so its
        # compare-and-set must be atomic or a lower timestamp could overwrite a higher one.
        self._timestampLock = threading.Lock()
        self._downloaderIsStopped = isStopped

    def _startExternalDownloads(self, submission, contentType, getImages, getVideos):
        """
        Queue the image downloads, and unless videos are avoided the video downloads, for one kind of
        external content of a submission.

        :param contentType: the DownloadedContentType member recorded on the DownloadedContent
        :param getImages: callable(submission, lstModelObj, queue) producing the images to download
        :param getVideos: callable(submission, lstModelObj) producing the videos to download
        :type submission: praw.objects.Submission
        """
        downloadedContent = DownloadedContent(submission.permalink, contentType)
        images = getImages(submission, self._lstModelObj, self._queue)
        self._startDownloadImages(images, downloadedContent, submission)
        if not self._rddtDataExtractor.avoidVideos:
            videos = getVideos(submission, self._lstModelObj)
            self._startDownloadVideos(videos, downloadedContent, submission)

    def _startDownloadsForSubmission(self, submission):
        """
        Start every kind of download that is both enabled on the extractor and reported as new content by the
        list model object for this submission.

        :type submission: praw.objects.Submission
        """
        extractor = self._rddtDataExtractor
        isNew = self._lstModelObj.isNewContent

        # Content linked by the submission itself: skipped for self posts and for reddit-hosted domains.
        if (extractor.getExternalContent
                and isNew(submission, DownloadedContentType.EXTERNAL_SUBMISSION_DATA)
                and not submission.is_self
                and "reddit" not in submission.domain):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_SUBMISSION_DATA,
                                         extractor.getImages, extractor.getVideos)

        # Content linked from the comments.
        if extractor.getCommentExternalContent and isNew(submission, DownloadedContentType.EXTERNAL_COMMENT_DATA):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_COMMENT_DATA,
                                         extractor.getCommentImages, extractor.getCommentVideos)

        # Content linked from the selftext.
        if extractor.getSelftextExternalContent and isNew(submission, DownloadedContentType.EXTERNAL_SELFTEXT_DATA):
            self._startExternalDownloads(submission, DownloadedContentType.EXTERNAL_SELFTEXT_DATA,
                                         extractor.getSelftextImages, extractor.getSelftextVideos)

        # The submission's own JSON data; not started once the session has been stopped.
        if (extractor.getSubmissionContent
                and isNew(submission, DownloadedContentType.JSON_DATA)
                and not self._downloaderIsStopped()):
            downloadedContent = DownloadedContent(submission.permalink, DownloadedContentType.JSON_DATA)
            submissionWorker = SubmissionWorker(extractor, self._lstModelObj, submission, self._queue,
                                                downloadedContent, self._lstModelType,
                                                self.setMostRecentDownloadTimestamp, self._downloaderIsStopped)
            self._submissionPool.start(submissionWorker)

    def _startDownloadImages(self, images, downloadedContent, submission):
        """
        Start an ImageWorker on the image pool for every non-None image, stopping early if the session
        has been stopped. A ``None`` iterable is treated as "nothing to download".

        :type images: generator
        :type downloadedContent: DownloadedContent
        :type submission: praw.objects.Submission
        """
        if images is None:
            return
        for image in images:
            if self._downloaderIsStopped():
                break
            if image is not None:
                imageWorker = ImageWorker(image, self._lstModelObj, submission, self._queue, downloadedContent,
                                          self._rddtDataExtractor.avoidDuplicates,
                                          self.setMostRecentDownloadTimestamp, self._downloaderIsStopped)
                self._imagePool.start(imageWorker)

    def _startDownloadVideos(self, videos, downloadedContent, submission):
        """
        Start a VideoWorker on the video pool for every non-None video, stopping early if the session
        has been stopped. A ``None`` iterable is treated as "nothing to download", matching
        _startDownloadImages.

        :type videos: generator
        :type downloadedContent: DownloadedContent
        :type submission: praw.objects.Submission
        """
        if videos is None:
            return
        for video in videos:
            if self._downloaderIsStopped():
                break
            if video is not None:
                videoWorker = VideoWorker(video, self._lstModelObj, submission, self._queue, downloadedContent,
                                          self._rddtDataExtractor.avoidDuplicates, self.setMostRecentDownloadTimestamp,
                                          self._downloaderIsStopped)
                self._videoPool.start(videoWorker)

    def run(self):
        """
        Download everything for this user or subreddit: fetch its submissions, start the downloads for each
        one that passes the filter, wait for all three pools to drain, then record the newest timestamp on
        the list model object. Does nothing if the session is already stopped.
        """
        if self._downloaderIsStopped():
            return

        name = self._lstModelObj.name
        self._queue.put("Starting download for " + name + "\n")
        self._rddtDataExtractor.makeDirectory(name)
        if self._lstModelType is ListType.SUBREDDIT:
            submitted = self._rddtDataExtractor.getSubredditSubmissions(self._validatedPRAWUserOrSub)
        else:
            submitted = self._validatedPRAWUserOrSub.get_submitted(limit=None)
        submissions = self._rddtDataExtractor.getValidSubmissions(submitted, self._lstModelObj)
        for submission, passesFilter in submissions:
            if passesFilter:
                self._startDownloadsForSubmission(submission)

        # Wait for every started job before reading the timestamp they report into.
        self._imagePool.waitForDone()
        self._submissionPool.waitForDone()
        self._videoPool.waitForDone()

        # If no job reported a timestamp (nothing downloaded, or stopped early) the tracker is still
        # None; keep whatever the list model object already has rather than erasing it.
        if self._mostRecentDownloadTimestamp is not None:
            self._lstModelObj.mostRecentDownloadTimestamp = self._mostRecentDownloadTimestamp
        self._queue.put("Finished download for " + name + "\n")

    def setMostRecentDownloadTimestamp(self, utc):
        """
        As the various threads download submissions, this keeps track of the most recent (by creation date) one.
        Then, when ALL downloads are finished, it sets the lstModelObj's mostRecentDownloadTimestamp. This
        allows submissions to be downloaded out of order in a download session, and still be able to prevent
        downloads from older time periods unless the user specifies they don't want that behavior.

        The comparison and assignment happen under a lock so concurrent callers cannot replace a newer
        timestamp with an older one.

        :type utc: float
        """
        with self._timestampLock:
            if self._mostRecentDownloadTimestamp is None or utc > self._mostRecentDownloadTimestamp:
                self._mostRecentDownloadTimestamp = utc