Example #1
    def _crawl(self, options):
        # Module-level imports assumed: sys, requests, grequests,
        # urlparse (urllib.parse on Python 3) and iterable_helper.
        # ToDo: allow more than just the images that are initially loaded on imgur
        starting_page_url = urlparse.urljoin(self.IMGUR_BASE_URL,
                                             self.url_path)
        starting_page_response = requests.get(starting_page_url)
        starting_page = StartingPage(starting_page_response)

        if not starting_page.links:
            sys.exit('No posts found on the specified starting page')

        # batch the links so that not too many async requests are made at once
        for link_batch in iterable_helper.batch(starting_page.links, 5):
            # ToDo: replace the set membership testing with a bloom filter
            # when urls are persisted between runs
            new_links = filter(lambda l: l not in self.visited_links,
                               link_batch)
            rs = (grequests.get(link) for link in new_links)
            responses = grequests.map(rs)

            while responses:
                response = responses.pop()
                # grequests.map leaves None in place of failed requests
                if response is None:
                    continue
                post = Post(response)
                self.visited_links.add(post.url)

                if self._meets_criteria(post):
                    self.posts.append(post)
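
Both examples batch their URLs through iterable_helper.batch, a project helper whose source isn't shown here. A minimal sketch of a compatible implementation, assuming it simply yields fixed-size chunks from any iterable (the function below is an illustration, not the project's actual code):

import itertools

def batch(iterable, size):
    # Yield successive lists of at most `size` items from `iterable`.
    # Hypothetical stand-in for iterable_helper.batch as called above.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

Capping each chunk at five keeps any single grequests.map call from opening more than five concurrent connections.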
Example #2
import os

import grequests

# mkdir_p, generate_file_path, DOWNLOAD_PATH and iterable_helper are
# project helpers assumed to be imported at module level.


def download_images(file_names, progress_reporter=None):
    mkdir_p(DOWNLOAD_PATH)

    # skip files that are already on disk
    existing_files = set()
    for name in file_names:
        if os.path.isfile(generate_file_path(name)):
            existing_files.add(name)
    urls_to_download = [
        'http://i.imgur.com/' + name
        for name in set(file_names) - existing_files
    ]

    downloaded = 0
    # batch the image requests so that not too many async requests are made at once
    for batch in iterable_helper.batch(urls_to_download, 5):
        # `reqs`, not `requests`, to avoid shadowing the requests library
        reqs = (grequests.get(url) for url in batch)
        responses = grequests.map(reqs)

        for response in responses:
            # grequests.map leaves None in place of failed requests
            if response is None:
                continue
            path = generate_file_path(response.url.split('/')[-1])
            # binary mode: response.content is raw image bytes
            with open(path, 'wb') as f:
                f.write(response.content)
            downloaded += 1
            if progress_reporter:
                progress_reporter(downloaded)
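
grequests.map puts None in the result list for any request that failed, which is why both loops above guard against it. If failures should be logged rather than silently skipped, grequests.map also accepts an exception_handler callback. A hedged sketch (fetch_batch is a hypothetical helper, not part of the project):

import grequests

def fetch_batch(urls):
    # Fire a batch of GETs concurrently and drop the failures.
    def on_error(request, exception):
        # grequests invokes this for requests that raised; their slot
        # in the result list stays None.
        print('request to %s failed: %s' % (request.url, exception))

    reqs = (grequests.get(url) for url in urls)
    responses = grequests.map(reqs, exception_handler=on_error)
    return [r for r in responses if r is not None]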