Example #1
    def test():
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://twitter.com'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            # Raise exception because the site is *very* broken, definitely can't rip from it if we can't hit the home page.
            raise Exception('unable to retrieve data from %s' % url)

        # Check ripper gets all images in an album
        url = SiteTwitter.get_sample_url()
        s = SiteTwitter(url)
        SiteTwitter.MAX_REQUESTS_PER_RIP = 1
        urls = s.get_urls()
        expected = 5
        if len(urls) < expected:
            # Returning non-None string since this may be a transient error.
            # Maybe the album was deleted but the ripper is working as expected.
            return 'expected at least %d images, got %d. url: %s' % (
                expected, len(urls), url)

        # Returning None because the ripper is working as expected. No issues found.
        return None
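
All of these examples lean on a small `Httpy` HTTP wrapper whose implementation is not shown here. The sketch below is a minimal stand-in inferred from how the examples call it (`get` returning the body as text, `between` extracting substrings between two delimiters, `download` returning raw bytes), useful for running the snippets in isolation; the real class presumably also handles cookies, retries, and user agents:

    # Minimal stand-in for the Httpy wrapper, inferred from usage in these examples.
    try:
        from urllib.request import Request, urlopen   # Python 3
    except ImportError:
        from urllib2 import Request, urlopen          # Python 2

    class Httpy(object):
        def __init__(self, user_agent=None):
            self.user_agent = user_agent or 'Httpy'

        def download(self, url):
            # Raw bytes of the response
            req = Request(url, headers={'User-Agent': self.user_agent})
            return urlopen(req).read()

        def get(self, url):
            # Response body decoded to text
            return self.download(url).decode('utf-8', 'ignore')

        def between(self, text, left, right):
            # Every substring found between 'left' and 'right'
            result, i = [], 0
            while True:
                start = text.find(left, i)
                if start == -1:
                    break
                start += len(left)
                end = text.find(right, start)
                if end == -1:
                    break
                result.append(text[start:end])
                i = end + len(right)
            return result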
Example #2
    def get_urls_user_albums(self):
        if self.url.endswith('/all'):
            # Images, not albums
            return self.get_urls_user_images()

        from time import sleep
        from Httpy import Httpy
        httpy = Httpy()

        user = self.url.split('//')[1].split('.')[0]
        r = httpy.get(self.url)
        result = []
        for (index, cover) in enumerate(
                httpy.between(r, '<div class="cover">', '</div>')):
            if not '<a href="' in cover: continue
            album = httpy.between(cover, '<a href="', '"')[0]
            if album.startswith('//'):
                album = 'http:%s' % album
            albumid = album.split('/')[4]
            album = 'http://imgur.com/a/%s' % albumid
            for image in self.get_urls_album(album):
                # Tack this album's index/albumid to image
                image['saveas'] = '%03d_%s_%s' % (index + 1, albumid,
                                                  image['saveas'])
                result.append(image)
            sleep(2)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
        return result
Example #3
	def get_urls(self):
		self.api_key = self.db.get_config('tumblr_key')
		if self.api_key is None:
			raise Exception('unable to rip album (%s), tumblr key not found in database' % self.url)

		from json import loads
		from time import sleep
		from Httpy import Httpy
		httpy = Httpy()

		result = []
		offset = 0
		while True:
			url = self.get_api_url(offset=offset)
			r = httpy.get(url)
			json = loads(r)
			if not 'response' in json or not 'posts' in json['response']:
				#raise Exception('no posts found at %s' % self.url)
				break

			posts = json['response']['posts']
			if len(posts) == 0: break

			for post in posts:
				for photos in post['photos']:
					result.append(photos['original_size']['url'])
			if self.post_type == 'post': break
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
			offset += 20
			sleep(1)
		return result
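
The `get_api_url` helper is not shown in this excerpt. Assuming it targets the standard Tumblr v2 posts endpoint, a minimal sketch might look like this (the `self.user` attribute and the exact parameter set are assumptions):

	def get_api_url(self, offset=0):
		# Hypothetical sketch of the URL builder used above. Tumblr's v2 API
		# pages through posts with an 'offset' parameter, 20 posts per page.
		return ('http://api.tumblr.com/v2/blog/%s.tumblr.com/posts'
		        '?api_key=%s&offset=%d' % (self.user, self.api_key, offset))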
Example #4
    def get_urls_user_images(self):
        from json import loads
        from time import sleep
        from Httpy import Httpy
        httpy = Httpy()

        result = []
        url = self.url.replace('/all', '')
        page = total = index = 0
        while True:
            page += 1
            next_page = '%s/ajax/images?sort=0&order=1&album=0&page=%d&perPage=60' % (
                url, page)
            r = httpy.get(next_page)
            json = loads(r)
            data = json['data']
            if total == 0 and 'count' in data:
                total = data['count']
            # TODO report progress
            for image in data['images']:
                result.append('http://i.imgur.com/%s%s' %
                              (image['hash'], image['ext']))
                index += 1  # count retrieved images so the loop can reach 'count'
            if index >= total:
                break
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
            sleep(1)
        return result
Example #5
	def get_urls_user_albums(self):
		if self.url.endswith('/all'):
			# Images, not albums
			return self.get_urls_user_images()

		from time import sleep
		from Httpy import Httpy
		httpy = Httpy()

		user = self.url.split('//')[1].split('.')[0]
		r = httpy.get(self.url)
		result = []
		for (index, cover) in enumerate(httpy.between(r, '<div class="cover">', '</div>')):
			if not '<a href="' in cover: continue
			album = httpy.between(cover, '<a href="', '"')[0]
			if album.startswith('//'):
				album = 'http:%s' % album
			albumid = album.split('/')[4]
			album = 'http://imgur.com/a/%s' % albumid
			for image in self.get_urls_album(album):
				# Tack this album's index/albumid to image
				image['saveas'] = '%03d_%s_%s' % (index + 1, albumid, image['saveas'])
				result.append(image)
			sleep(2)
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
		return result
Example #6
    def test():
        '''
            Test that ripper is working as expected.
            Raise exception if necessary.
        '''
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://imgur.com'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            raise Exception('unable to retrieve data from %s' % url)

        # Check ripper gets all images in an album
        #url = 'http://markedone911.imgur.com/'
        #url = 'http://imgur.com/r/nsfw_oc/top/all'
        url = SiteImgur.get_sample_url()
        s = SiteImgur(url)
        urls = s.get_urls()
        for (i, u) in enumerate(urls):
            print(i, u)
        expected = 4
        if len(urls) < expected:
            return 'expected at least %d images, got %d. url: %s' % (
                expected, len(urls), url)
        return None
Example #7
def search_img_url(query, distance):
    if ' ' in query:
        query = query.replace(' ', '%20')

    if not is_valid_url(query):
        raise Exception("Invalid query: '%s'" % query)

    try:
        hash = db.get_image_hash_from_url(url=query)

        if not hash:
            # Download image
            web = Httpy()
            try:
                image_buffer = web.download(url=query)
            except Exception:
                raise Exception('unable to download image at %s' % query)

            try:
                im = image_from_buffer(image_buffer)
                hash = get_hash(im)
            except Exception:
                raise Exception("Could not identify image")

        images = db.get_similar_images(hash, distance=distance)
        results = build_results_for_images(images)

    except Exception as e:
        return Response(json.dumps({'error': str(e)}), mimetype="application/json")

    return Response(results.json(), mimetype="application/json")
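
`is_valid_url` is referenced but not defined in this excerpt. A minimal sketch of such a check, assuming it only needs to reject obviously malformed input (the real implementation may be stricter):

    # Hypothetical helper, not part of the original excerpt.
    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse       # Python 2

    def is_valid_url(url):
        parsed = urlparse(url)
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)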
Example #8
	def test():
		'''
			Test that ripper is working as expected.
			Raise exception if necessary.
		'''
		from Httpy import Httpy
		httpy = Httpy()

		# Check we can hit the host
		url = 'http://imgur.com'
		r = httpy.get(url)
		if len(r.strip()) == 0:
			raise Exception('unable to retrieve data from %s' % url)

		# Check ripper gets all images in an album
		#url = 'http://markedone911.imgur.com/'
		#url = 'http://imgur.com/r/nsfw_oc/top/all'
		url = SiteImgur.get_sample_url()
		s = SiteImgur(url)
		urls = s.get_urls()
		for (i, u) in enumerate(urls):
			print(i, u)
		expected = 4
		if len(urls) < expected:
			return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
		return None
Example #9
    def test():
        '''
            Test that ripper is working as expected.
            StatusManager.py uses the results of this method to show which rippers are working/broken on the main page.

            Returns:
                None - if ripper is working as expected
                str - warning message if the ripper may not be working properly.

            Raises:
                Exception - if ripper is definitely broken. Exception message is displayed on the site.
        '''
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://hostname.com'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            # Raise exception because the site is *very* broken, definitely can't rip from it if we can't hit the home page.
            raise Exception('unable to retrieve data from %s' % url)

        # Check ripper gets all images in an album
        url = _SampleSite.get_sample_url()
        s = _SampleSite(url)
        urls = s.get_urls()
        expected = 10
        if len(urls) < expected:
            # Returning non-None string since this may be a transient error.
            # Maybe the album was deleted but the ripper is working as expected.
            return 'expected at least %d images, got %d. url: %s' % (
                expected, len(urls), url)

        # Returning None because the ripper is working as expected. No issues found.
        return None
Example #10
def search_vid_url(query, distance, frame_count):
    if ' ' in query:
        query = query.replace(' ', '%20')

    try:
        video_id = db.get_video_from_url(url=query)

        if not video_id:
            # Download video
            web = Httpy()
            video_buffer = web.download(url=query)
            if not video_buffer:
                raise Exception('unable to download video at %s' % query)

            try:
                frames, info = info_from_video_buffer(video_buffer, os.path.splitext(query)[1][1:])
            except Exception:
                raise Exception("Could not identify video")

            videos = db.get_similar_videos_by_hash(frames, distance, frame_count)

        else:
            hashes = db.get_video_hashes(video_id)
            videos = db.get_similar_videos_by_hash(hashes, distance, frame_count)

        results = SearchResults(db.build_results_for_videos(videos))

    except Exception as e:
        return Response(json.dumps({'error': str(e)}), mimetype="application/json")

    return Response(results.json(), mimetype="application/json")
Example #11
	def test():
		'''
			Test that ripper is working as expected.
			StatusManager.py uses the results of this method to show what rippers are working/broken on the main page

			Returns:
				None - if ripper is working as expected
				str - Warning message if the ripper may not be working properly.

			Raises:
				Exception - if ripper is definitely broken. Exception message is used to display on site.
		'''
		from Httpy import Httpy
		httpy = Httpy()

		# Check we can hit the host
		url = 'http://hostname.com'
		r = httpy.get(url)
		if len(r.strip()) == 0:
			# Raise exception because the site is *very* broken, definitely can't rip from it if we can't hit the home page.
			raise Exception('unable to retrieve data from %s' % url)

		# Check ripper gets all images in an album
		url = _SampleSite.get_sample_url()
		s = _SampleSite(url)
		urls = s.get_urls()
		expected = 10
		if len(urls) < expected:
			# Returning non-None string since this may be a transient error.
			# Maybe the album was deleted but the ripper is working as expected.
			return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)

		# Returning None because the ripper is working as expected. No issues found.
		return None
Example #12
    def test():
        '''
            Test that ripper is working as expected.
            Raise exception if necessary.
        '''
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://8muses.com/'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            raise Exception('unable to retrieve data from %s' % url)

        # Check ripper gets all images in an album
        url = 'http://www.8muses.com/index/category/hotassneighbor7'
        s = Site8muses(url)
        urls = s.get_urls()
        # Use a separate loop variable so the album 'url' is not clobbered
        # before the error message below reports it.
        for (i, u) in enumerate(urls):
            print(i, u)
        expected = 21
        if len(urls) != expected:
            return 'expected %d images, got %d. url: %s' % (expected,
                                                            len(urls), url)
        return None
Example #13
	def get_image_count_for_album(url):
		url = url.replace('m.imgur.com', 'imgur.com').replace('https://', '').replace('http://', '')
		aid = url.split('/')[2]
		url = 'http://imgur.com/a/%s/noscript' % aid
		from Httpy import Httpy
		httpy = Httpy()
		r = httpy.get(url)
		return r.count('src="//i.imgur.com')
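
This helper avoids the imgur API entirely: the `/noscript` variant of an album page inlines every image, so counting `src="//i.imgur.com` occurrences gives the image count. A hypothetical call (the album ID is made up):

    count = get_image_count_for_album('http://imgur.com/a/abc12')
    print('album contains %d images' % count)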
Example #14
	def test():
		from Httpy import Httpy
		httpy = Httpy()
		try:
			r = httpy.get('http://www.vimeo.com/')
			if len(r.strip()) == 0:
				raise Exception('empty response from vimeo.com')
		except Exception:
			raise
Example #15
 def test():
     from Httpy import Httpy
     httpy = Httpy()
     try:
         r = httpy.get('http://www.vimeo.com/')
         if len(r.strip()) == 0:
             raise Exception('empty response from vimeo.com')
     except Exception:
         raise
Example #16
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		for link in httpy.between(r, '/img.php?path=', '"'):
			result.append(link)
		return result
Example #17
	def sanitize_url(self):
		if '/image.php?id=' in self.url:
			from Httpy import Httpy
			httpy = Httpy()
			r = httpy.get(self.url)
			if not 'View complete gallery: <a href="' in r:
				raise Exception('no gallery found at %s' % self.url)
			self.url = 'http://imagearn.com/%s' % httpy.between(r, 'View complete gallery: <a href="', '"')[0]
		if not '/gallery.php?id=' in self.url:
			raise Exception('expected /gallery.php?id= not found in URL')
Example #18
	def sanitize_url(self):
		if '/image/' in self.url:
			from Httpy import Httpy
			httpy = Httpy()
			r = httpy.get(self.url)
			if not "class='gallery_title'><a href='" in r:
				raise Exception('no gallery found at %s' % self.url)
			self.url = httpy.between(r, "class='gallery_title'><a href='", "'")[0]
		if not '/gallery/' in self.url:
			raise Exception('expected /gallery/ not found in URL')
		if not self.url.endswith('/'): self.url += '/'
Example #19
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		for post in httpy.between(r, 'daposts">', '</div> </div> </div>'):
			images = httpy.between(post, 'href="', '"')
			if len(images) > 0 and 'javascript:' not in images[0]:
				result.append('http://www.chansluts.com%s' % images[0])
		return result
Example #20
    def get_urls(self):
        from Httpy import Httpy
        httpy = Httpy()

        r = httpy.get(self.url)
        result = []
        for post in httpy.between(r, 'daposts">', '</div> </div> </div>'):
            images = httpy.between(post, 'href="', '"')
            if len(images) > 0 and 'javascript:' not in images[0]:
                result.append('http://www.chansluts.com%s' % images[0])
        return result
Example #21
 def sanitize_url(self):
     if '/image.php?id=' in self.url:
         from Httpy import Httpy
         httpy = Httpy()
         r = httpy.get(self.url)
         if not 'View complete gallery: <a href="' in r:
             raise Exception('no gallery found at %s' % self.url)
         self.url = 'http://imagearn.com/%s' % httpy.between(
             r, 'View complete gallery: <a href="', '"')[0]
     if not '/gallery.php?id=' in self.url:
         raise Exception('expected /gallery.php?id= not found in URL')
Example #22
 def sanitize_url(self):
     if '/image/' in self.url:
         from Httpy import Httpy
         httpy = Httpy()
         r = httpy.get(self.url)
         if not "class='gallery_title'><a href='" in r:
             raise Exception('no gallery found at %s' % self.url)
         self.url = httpy.between(r, "class='gallery_title'><a href='",
                                  "'")[0]
     if not '/gallery/' in self.url:
         raise Exception('expected /gallery/ not found in URL')
     if not self.url.endswith('/'): self.url += '/'
Example #23
    def get_urls(self):
        from threading import Thread
        from time import sleep
        from Httpy import Httpy
        httpy = Httpy()

        # Sign in so we can get restricted content
        self.flickr_signin()

        r = httpy.get(self.url)
        self.result = []
        index = 0
        while True:
            for link in self.httpy.between(
                    r, '><a data-track="photo-click" href="', '"'):
                if link == '{{photo_url}}': continue
                link = 'http://www.flickr.com%s' % link
                while not link.endswith('/'):
                    link += '/'
                link += 'sizes/o/'  # Default to 'original' size

                # Find and download image at this page
                while len(self.threads) >= self.max_threads:
                    sleep(0.1)  # Wait for threads
                self.threads.append(None)
                t = Thread(target=self.get_url_from_page, args=(
                    link,
                    index,
                ))
                t.start()
                index += 1
                if len(self.result) + len(
                        self.threads) > self.MAX_IMAGES_PER_RIP:
                    break

            if len(self.result) + len(self.threads) > self.MAX_IMAGES_PER_RIP:
                break

            # Look for 'next' button
            if 'data-track="next" href="' in r:
                nextpage = self.httpy.between(r, 'data-track="next" href="',
                                              '"')[0]
                if not 'flickr.com' in nextpage:
                    nextpage = 'http://flickr.com%s' % nextpage
                r = self.httpy.get(nextpage)
            else:
                # No more pages, we're done
                break

        # Wait for threads to finish
        while len(self.threads) > 0:
            sleep(0.1)
        return self.result
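
`get_url_from_page` is not shown here. The `self.threads` list is used as a crude semaphore (one placeholder entry per live worker), so whatever the worker does, it must remove an entry when it finishes or the waiting loops above will never drain. A sketch under that assumption; the sizes-page parsing is illustrative, not Flickr's real markup:

    def get_url_from_page(self, link, index):
        # Hypothetical worker: fetch the 'sizes/o/' page, record the image URL,
        # then release this worker's slot in self.threads.
        try:
            r = self.httpy.get(link)
            images = self.httpy.between(r, '<img src="', '"')  # illustrative delimiters
            if len(images) > 0:
                self.result.append(images[0])
        finally:
            self.threads.pop()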
Example #24
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		fields = self.url.split('/')
		url = 'http://api.4chan.org/%s/res/%s.json' % (fields[3], fields[5])
		try:
			r = httpy.get(url)
			json = loads(r)
			posts = json['posts']
		except Exception as e:
			raise Exception('failed to load %s: %s' % (url, str(e)))
Example #25
    def get_urls(self):
        from Httpy import Httpy
        httpy = Httpy()

        fields = self.url.split('/')
        url = 'http://api.4chan.org/%s/res/%s.json' % (fields[3], fields[5])
        try:
            r = httpy.get(url)
            json = loads(r)
            posts = json['posts']
        except Exception as e:
            raise Exception('failed to load %s: %s' % (url, str(e)))
Example #26
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()
		r = httpy.get(self.url)
		r = r[r.find('showMoreGalleries'):] # To ignore user icon
		links = httpy.between(r, 'border=0 src="', '"')
		result = []
		for link in links:
			link = 'http://%s' % link[link.find('.')+1:].replace('/images/thumb/', '/images/full/')
			result.append(link)
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
		return result
Example #27
 def get_urls(self):
     from Httpy import Httpy
     httpy = Httpy()
     r = httpy.get(self.url)
     r = r[r.find('showMoreGalleries'):]  # To ignore user icon
     links = httpy.between(r, 'border=0 src="', '"')
     result = []
     for link in links:
         link = 'http://%s' % link[link.find('.') + 1:].replace(
             '/images/thumb/', '/images/full/')
         result.append(link)
         if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
             break
     return result
Example #28
	def get_urls(self):
		'''
			Returns list of URLs from album. Does not download them.
		'''
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		for link in httpy.between(r, '<img src="', '"'):
			link = 'http://hostname.com%s' % link
			result.append(link)
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
		return result
Example #29
	def searchImages(unfiltered_search_text, start_index, source_ip='127.0.0.1', safe='off'):
		search_text = unfiltered_search_text.replace(' ', '%20')

		url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0'
		url +=      '&q=%s' % search_text.replace(' ', '%20')
		url +=  '&start=%d' % start_index
		url += '&userip=%s' % source_ip
		url +=   '&safe=%s' % safe

		from Httpy import Httpy
		httpy = Httpy()
		try:
			response = httpy.get(url)
		except Exception:
			raise
Example #30
    def get_urls(self):
        '''
            Returns list of URLs from album. Does not download them.
        '''
        from Httpy import Httpy
        httpy = Httpy()

        r = httpy.get(self.url)
        result = []
        for link in httpy.between(r, '<img src="', '"'):
            link = 'http://hostname.com%s' % link
            result.append(link)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
        return result
Example #31
    def get_urls(self):
        from Httpy import Httpy
        httpy = Httpy()

        r = httpy.get(self.url)
        result = []
        for link in httpy.between(r, 'src="', '"'):
            if not 'http://' in link: continue
            if not 'imgur.com' in link: continue
            # Strip imgur's 'm' (medium thumbnail) suffix to get the full-size image
            doti = link.rfind('.') - 1
            if link[doti] == 'm':
                link = link.replace(link[doti:], link[doti + 1:])
            result.append(link)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
        return result
Example #32
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		chunks = httpy.between(r, '<article class="', '</article>')
		if len(chunks) == 0:
			raise Exception('unable to find "article class" at %s' % self.url)
		r = chunks[0]
		result = []
		for link in httpy.between(r, '<a href="', '"'):
			if link.startswith('//'):
				link = 'http:%s' % link
			link = link.replace(' ', '%20')
			result.append(link)
		return result
Example #33
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		for link in httpy.between(r, 'data-cfsrc="', '"'):
			if link.startswith('//'):
				link = 'http:%s' % link
			link = link.replace(' ', '%20')
			if '-cu_' in link:
				temp = link[:link.find('-cu_')]
				temp = '%s-me.%s' % (temp, link.split('.')[-1])
				link = temp
			result.append(link)
		return result
Example #34
    def __init__(self, url):
        if not self.can_rip(url):
            # Don't instantiate if we can't rip it
            raise Exception('ripper (%s) cannot rip URL (%s)' %
                            (self.__class__.__name__, url))
        self.url = url
        self.sanitize_url()
        self.album_name = self.get_album_name()
        self.db = DB()
        self.httpy = Httpy()
        self.max_threads = self.MAX_THREADS
        self.threads = []

        self.album_id = self.db.select_one(
            'rowid', 'albums', 'host = ? and name = ?',
            [self.get_host(), self.get_album_name()])
        self.path = self.db.select_one(
            'path', 'albums', 'host = ? and name = ?',
            [self.get_host(), self.get_album_name()])

        if self.path is None:
            # Album does not exist.
            self.album_exists = False
            self.path = '%s_%s' % (self.get_host(), self.album_name)
        else:
            # Album already exists
            self.album_exists = True
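
The `can_rip` check called at the top is not shown. Rippers like these usually just match the URL against a host pattern; a minimal sketch under that assumption (the regex is illustrative):

    @staticmethod
    def can_rip(url):
        # Hypothetical sketch: each Site* subclass would supply its own pattern.
        import re
        return re.match(r'^https?://([a-z0-9\-]+\.)*imgur\.com/', url) is not None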
Example #35
	def get_urls_album(url):
		'''
			Requires URL in the format: http://imgur.com/a/[albumid]
		'''
		from json import loads
		from Httpy import Httpy
		httpy = Httpy()

		try:
			r = httpy.get('http://api.imgur.com/2/album/%s.json' % url.split('/')[-1])
			json = loads(r)
			if 'error' in json:
				# API returned an error, fall back to the noscript method
				raise Exception(json['error'])
		except Exception:
			# Request failed or returned an error, fall back to the noscript method
			return SiteImgur.get_urls_album_noscript(url)
Example #36
    def get_urls(self):
        from Httpy import Httpy
        httpy = Httpy()

        r = httpy.get(self.url)
        result = []
        for link in httpy.between(r, 'data-cfsrc="', '"'):
            if link.startswith('//'):
                link = 'http:%s' % link
            link = link.replace(' ', '%20')
            if '-cu_' in link:
                temp = link[:link.find('-cu_')]
                temp = '%s-me.%s' % (temp, link.split('.')[-1])
                link = temp
            result.append(link)
        return result
Example #37
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		for link in httpy.between(r, 'src="', '"'):
			if not 'http://' in link: continue
			if not 'imgur.com' in link: continue
			# Strip imgur's 'm' (medium thumbnail) suffix to get the full-size image
			doti = link.rfind('.')-1
			if link[doti] == 'm':
				link = link.replace(link[doti:], link[doti+1:])
			result.append(link)
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
		return result
Example #38
	def get_urls(self):
		from threading import Thread
		from time import sleep
		from Httpy import Httpy
		httpy = Httpy()

		# Sign in so we can get restricted content
		self.flickr_signin()

		r = httpy.get(self.url)
		self.result = []
		index = 0
		while True:
			for link in self.httpy.between(r, '><a data-track="photo-click" href="', '"'):
				if link == '{{photo_url}}': continue
				link = 'http://www.flickr.com%s' % link
				while not link.endswith('/'):
					link += '/'
				link += 'sizes/o/' # Default to 'original' size

				# Find and download image at this page
				while len(self.threads) >= self.max_threads:
					sleep(0.1) # Wait for threads
				self.threads.append(None)
				t = Thread(target=self.get_url_from_page, args=(link,index,))
				t.start()
				index += 1
				if len(self.result) + len(self.threads) > self.MAX_IMAGES_PER_RIP:
					break

			if len(self.result) + len(self.threads) > self.MAX_IMAGES_PER_RIP:
				break

			# Look for 'next' button
			if 'data-track="next" href="' in r:
				nextpage = self.httpy.between(r, 'data-track="next" href="', '"')[0]
				if not 'flickr.com' in nextpage:
					nextpage = 'http://flickr.com%s' % nextpage
				r = self.httpy.get(nextpage)
			else:
				# No more pages, we're done
				break

		# Wait for threads to finish
		while len(self.threads) > 0:
			sleep(0.1)
		return self.result
Example #39
	def sanity_check(db, spamtype, spamtext):
		'''
			Ensures the spam filter is not malicious.
			Raises:
				Exception if filter is malicious and should not be added.
		'''
		spamtext = spamtext.lower()
		whitelist = [
				# URLS
				'http://reddit.com/r/',
				'http://reddit.com/comments/',
				'http://www.reddit.com/r/',
				'http://www.reddit.com/comments',
				'http://imgur.com/',
				'http://imgur.com/a/',
				'http://i.imgur.com/',
				'http://www.imgur.com/',
				'http://www.imgur.com/a/',
				'http://i.rarchives.com/',
				'http://www.rarchives.com/',
				'http://rip.rarchives.com/',
				# TEXT - TODO Get a better text whitelist
				'''the quick brown fox jumped over the lazy dog'''
			]
		if spamtype == 'link' or spamtype == 'text':
			if len(spamtext) <= 3:
				raise Exception('[**!**] `%s` filter "`%s`" was **not** added because it is not long enough (must be more than 3 characters long).\n\n' % (spamtype, spamtext))
			for whitelisted in whitelist:
				if spamtext in whitelisted.lower():
					raise Exception('[**!**] `%s` filter "`%s`" was **not** added because it might remove relevant posts/comments (e.g. `%s...`).\n\n' % (spamtype, spamtext, whitelisted))

		elif spamtype == 'tld':
			if spamtext in ['com', 'net', 'org']:
				raise Exception('[**!**] TLD `%s` was **not** added because it might remove relevant links (e.g. `.com` or `.net` or `.org`).\n\n' % spamtext)

		elif spamtype == 'user':
			if db.count('admins', 'username like ?', [spamtext]) > 0:
				raise Exception('[**!**] User `%s` was **not** added because you cannot add an admin to the spam filter\n\n' % spamtext)

		elif spamtype == 'thumb':
			# To validate the thumb-spam filter, load a non-spam imgur album and test the filter on that
			httpy = Httpy()
			unicode_resp = httpy.get('http://imgur.com/a/RdXNa')
			r = unicode_resp.decode('UTF-8').encode('ascii', 'ignore')
			if spamtext in r:
				raise Exception('[**!**] Thumb-spam filter `%s` was **not** added because the bot detected a false-positive (non-spam imgur albums would be detected as spam).\n\n' % spamtext)
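
Calling the filter check is then a matter of catching the exception, whose message is already formatted for posting back to the user. A hypothetical invocation (`db` is assumed to be an open DB handle):

    try:
        sanity_check(db, 'link', 'imgur.com/a/')
    except Exception as e:
        print(str(e))  # the message explains why the filter was rejected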
Example #40
    def get_urls_album(url):
        '''
            Requires URL in the format: http://imgur.com/a/[albumid]
        '''
        from json import loads
        from Httpy import Httpy
        httpy = Httpy()

        try:
            r = httpy.get('http://api.imgur.com/2/album/%s.json' %
                          url.split('/')[-1])
            json = loads(r)
            if 'error' in json:
                # API returned an error, fall back to the noscript method
                raise Exception(json['error'])
        except Exception:
            # Request failed or returned an error, fall back to the noscript method
            return SiteImgur.get_urls_album_noscript(url)
Example #41
	def get_urls_album_noscript(url):
		'''
			Requires URL in the format: http://imgur.com/a/[albumid]
		'''
		from Httpy import Httpy
		httpy = Httpy()
		r = httpy.get('%s/noscript' % url)
		result = []
		for link in httpy.between(r, 'img src="//i.', '"'):
			link = 'http://i.%s' % link
			try:
				link = SiteImgur.get_highest_res(link)
			except Exception:
				# Image is gone.
				# Add it anyway so RipManager will mark the image as 'errored'
				pass
			result.append(link)
Example #42
	def test():
		from Httpy import Httpy
		httpy = Httpy()

		# Check we can hit the host
		url = 'http://deviantart.com'
		r = httpy.get(url)
		if len(r.strip()) == 0:
			raise Exception('unable to retrieve data from %s' % url)

		url = 'http://www.imagefap.com/pictures/3802288/asdf'
		s = SiteImagefap(url)
		urls = s.get_urls()
		expected = 10
		if len(urls) != expected:
			return 'expected %d images, got %d. url: %s' % (expected, len(urls), url)
		return None
Example #43
	def get_urls(self):
		from threading import Thread
		from time import sleep
		from Httpy import Httpy
		httpy = Httpy()

		r = httpy.get(self.url)
		result = []
		already_got = []
		while True:
			for chunk in httpy.between(r, '<a class="thumb', '>'):
				if not 'href="' in chunk: continue
				link = httpy.between(chunk, 'href="', '"')[0]
				if link in already_got:
					continue
				already_got.append(link)
				# Get image from page
				while len(self.threads) >= self.max_threads:
					sleep(0.1)
				self.threads.append(None)
				t = Thread(target=self.get_url_from_page, args=(httpy, result, link,))
				t.start()
			# Go to next page
			nexts = httpy.between(r, '<li class="next">', '</li>')
			if len(nexts) == 0 or not 'href="' in nexts[0]:
				break
			next_page = httpy.between(nexts[0], 'href="', '"')[0]
			if not 'offset=' in next_page:
				break
			r = httpy.get(next_page)
		while len(self.threads) > 0:
			sleep(0.1)
		return result
Example #44
    def test():
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://deviantart.com'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            raise Exception('unable to retrieve data from %s' % url)

        url = 'http://www.imagefap.com/pictures/3802288/asdf'
        s = SiteImagefap(url)
        urls = s.get_urls()
        expected = 10
        if len(urls) != expected:
            return 'expected %d images, got %d. url: %s' % (expected,
                                                            len(urls), url)
        return None
Example #45
    def searchImages(unfiltered_search_text,
                     start_index,
                     source_ip='127.0.0.1',
                     safe='off'):
        search_text = unfiltered_search_text.replace(' ', '%20')

        url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0'
        url += '&q=%s' % search_text.replace(' ', '%20')
        url += '&start=%d' % start_index
        url += '&userip=%s' % source_ip
        url += '&safe=%s' % safe

        from Httpy import Httpy
        httpy = Httpy()
        try:
            response = httpy.get(url)
        except Exception:
            raise
Example #46
    def __init__(self, user='', password='', user_agent=None):
        """
            Initializes instance fields, sets user agent.
            Logs into reddit if user and password are given.
        """

        # Default user agent is awesome!
        if user_agent is None:
            user_agent = 'ReddiWrap'

        # Create object we will use to communicate with reddit's servers
        self.web = Httpy(user_agent=user_agent)

        self.modhash = ''  # Hash used to authenticate/interact with user account
        self.last_url = ''  # The last URL retrieved
        self.before = None  # ID pointing to 'previous' page
        self.after = None  # ID pointing to 'next' page
        self.logged_in = False  # Flag to detect if we are logged in or not
Example #47
class InstagramWrapper:
    httpy = Httpy()
    CLIENT_ID = 'ada2177105f94b05b21c3839c21d3794'

    @staticmethod
    def get_user_id(username):
        url = 'https://api.instagram.com/v1/users/search?q=%s' % username
        url += '&client_id=%s' % InstagramWrapper.CLIENT_ID
        json = loads(InstagramWrapper.httpy.get(url))
        users = json['data']
        for user in users:
            if user['username'] == username:
                return user['id']
        raise Exception("Username '%s' not found" % username)

    @staticmethod
    def get_posts(user_id, max_id=None, min_id=None):
        url = 'https://api.instagram.com/v1/users/%s/media/recent/' % user_id
        url += '?client_id=%s' % InstagramWrapper.CLIENT_ID
        if max_id:
            url += '&max_id=%s' % max_id
        if min_id:
            url += '&min_id=%s' % min_id
        json = loads(InstagramWrapper.httpy.get(url))
        results = []
        for post in json['data']:
            result = {
                'id': post['id'],
                'likes': post['likes']['count'],
                'images': post['images'],
                'link': post['link'],
                'tags': post['tags'],
                'type': post['type'],
                'created': post['created_time'],
            }
            if post.get('caption') is not None and 'text' in post['caption']:
                result['caption'] = post['caption']['text']
            if post['type'] == 'video' and 'videos' in post:
                result['videos'] = post['videos']
            results.append(result)
        return results

    @staticmethod
    def get_user_info(user_id):
        url = 'https://api.instagram.com/v1/users/%s' % user_id
        url += '?client_id=%s' % InstagramWrapper.CLIENT_ID
        json = loads(InstagramWrapper.httpy.get(url))
        data = json['data']
        return {
            'bio': data['bio'],
            'website': data['website'],
            'profile_picture': data['profile_picture'],
            'full_name': data['full_name'],
            'total_media': data['counts']['media']
        }
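
A typical flow resolves a username to its numeric ID, then pulls profile info and recent posts. A hypothetical usage sketch (the username is made up):

    uid = InstagramWrapper.get_user_id('someuser')
    info = InstagramWrapper.get_user_info(uid)
    posts = InstagramWrapper.get_posts(uid)
    print('%s has %d posts, %d fetched' % (info['full_name'], info['total_media'], len(posts)))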
Example #48
    def get_urls(self):
        self.api_key = self.db.get_config('tumblr_key')
        if self.api_key is None:
            raise Exception(
                'unable to rip album (%s), tumblr key not found in database' %
                self.url)

        from json import loads
        from time import sleep
        from Httpy import Httpy
        httpy = Httpy()

        result = []
        for posttype in ['photo', 'video']:
            offset = 0
            while True:
                url = self.get_api_url(offset=offset, posttype=posttype)
                r = httpy.get(url)
                json = None
                try:
                    json = loads(r)
                except ValueError:
                    # Response was not valid JSON
                    pass
                if json is None or 'response' not in json or 'posts' not in json['response']:
                    #raise Exception('no posts found at %s' % self.url)
                    break

                posts = json['response']['posts']
                if len(posts) == 0: break

                for post in posts:
                    if 'photos' in post:
                        for photos in post['photos']:
                            result.append(photos['original_size']['url'])
                    elif 'video_url' in post:
                        result.append(post['video_url'])
                if self.post_type == 'post': break
                if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                    break
                offset += 20
                sleep(1)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
        return result
Example #49
	def get_urls_subreddit(self):
		from Httpy import Httpy
		httpy = Httpy()

		page = 0
		result = []
		while True:
			r = httpy.get('%s/page/%d' % (self.url, page))
			links = httpy.between(r, ' src="//i.', '"')
			if len(links) == 0:
				# Hit end of pages
				return result
			for link in links:
				if link in result:
					# Pages started repeating
					return result
				link = self.get_highest_res(link)
				result.append(link)
			page += 1
Example #50
    def get_urls_album_noscript(url):
        '''
            Requires URL in the format: http://imgur.com/a/[albumid]
        '''
        from Httpy import Httpy
        httpy = Httpy()
        r = httpy.get('%s/noscript' % url)
        result = []
        for link in httpy.between(r, 'img src="//i.', '"'):
            link = 'http://i.%s' % link
        try:
            link = SiteImgur.get_highest_res(link)
        except Exception:
            # Image is gone.
            # Add it anyway so RipManager will mark the image as 'errored'
            pass
            result.append({'url': link, 'saveas': link[link.rfind('/') + 1:]})
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
Example #51
	def test():
		from Httpy import Httpy
		httpy = Httpy()

		# Check we can hit the host
		url = 'http://xhamster.com'
		r = httpy.get(url)
		if len(r.strip()) == 0:
			raise Exception('unable to retrieve data from %s' % url)

		# Check ripper gets all images in an album
		url = SiteXhamster.get_sample_url()
		s = SiteXhamster(url)
		urls = s.get_urls()

		expected = 10
		if len(urls) < expected:
			return 'expected at least %d images, got %d. url: %s' % (expected, len(urls), url)
		return None
Example #52
    def test():
        from Httpy import Httpy
        httpy = Httpy()

        # Check we can hit the host
        url = 'http://xhamster.com'
        r = httpy.get(url)
        if len(r.strip()) == 0:
            raise Exception('unable to retrieve data from %s' % url)

        # Check ripper gets all images in an album
        url = SiteXhamster.get_sample_url()
        s = SiteXhamster(url)
        urls = s.get_urls()

        expected = 10
        if len(urls) < expected:
            return 'expected at least %d images, got %d. url: %s' % (
                expected, len(urls), url)
        return None
Example #53
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		result = []
		page = 1
		r = httpy.get(self.url)
		while True:
			for chunk in httpy.between(r, "class='slideTool'", 'Related Galleries'):
				for link in httpy.between(chunk, "' src='", "'"):
					link = link.replace('_160.', '_1000.').replace('http://p2.', 'http://up.')
					result.append(link)
				break
			page += 1
			next_page = self.url.replace('.html', '-%d.html' % page)
			if next_page in r:
				r = httpy.get(next_page)
			else:
				break
		return result
Example #54
	def get_urls(self):
		from Httpy import Httpy
		httpy = Httpy()

		url = self.url
		result = []
		while True:
			r = httpy.get(url)
			for chunk in httpy.between(r, '<a name="', '</li>'):
				if not '<img src="' in chunk: continue
				image = httpy.between(chunk, '<img src="', '"')[0]
				image = image.replace('_stream', '_max')
				if image.startswith('//'):
					image = 'http:%s' % image
				result.append(image)
			if '<li class="next"><a href="' in r:
				url = httpy.between(r, '<li class="next"><a href="', '"')[0]
			else:
				break
		return result
Example #55
 def __init__(self):
     self.db = DB(DBFILE, **SCHEMA)
     self.web = Httpy()
     self._rabbitmq = pika.BlockingConnection(
         pika.ConnectionParameters(host='localhost'))
     self._rabbitmq_channel = self._rabbitmq.channel()
     self._rabbitmq_channel.exchange_declare(exchange='reddit',
                                             exchange_type='topic')
     self._rabbitmq_queue = self._rabbitmq_channel.queue_declare(
         '', exclusive=True)
     self._q = Queue()
Example #56
 def _message_callback_worker(self):
     logger.info("Started message callback worker")
     web = Httpy()
     while True:
         try:
             body = self._q.get()
             self._message_callback(body, web)
         except Exception as e:
             logger.error(e)
         finally:
             self._q.task_done()
Example #57
    def get_urls_subreddit(self):
        from Httpy import Httpy
        httpy = Httpy()

        page = 0
        result = []
        while True:
            r = httpy.get('%s/page/%d' % (self.url, page))
            links = httpy.between(r, ' src="//i.', '"')
            if len(links) == 0:
                # Hit end of pages
                return result
            for link in links:
                if link in result:
                    # Pages started repeating
                    return result
                link = self.get_highest_res(link)
                result.append(link)
            if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
                break
            page += 1
        return result
Example #58
	def get_urls_user_images(self):
		from json import loads
		from time import sleep
		from Httpy import Httpy
		httpy = Httpy()

		result = []
		url = self.url.replace('/all', '')
		page = total = index = 0
		while True:
			page += 1
			next_page = '%s/ajax/images?sort=0&order=1&album=0&page=%d&perPage=60' % (url, page)
			r = httpy.get(next_page)
			json = loads(r)
			data = json['data']
			if total == 0 and 'count' in data:
				total = data['count']
			# TODO report progress
			for image in data['images']:
				result.append('http://i.imgur.com/%s%s' % (image['hash'], image['ext']))
				index += 1  # count retrieved images so the loop can reach 'count'
			if index >= total or self.hit_image_limit(): break
			sleep(1)
		return result
Example #59
	def get_urls_album_noscript(url):
		'''
			Requires URL in the format: http://imgur.com/a/[albumid]
		'''
		from Httpy import Httpy
		httpy = Httpy()
		r = httpy.get('%s/noscript' % url)
		result = []
		for link in httpy.between(r, 'img src="//i.', '"'):
			link = 'http://i.%s' % link
			try:
				link = SiteImgur.get_highest_res(link)
			except Exception:
				# Image is gone.
				# Add it anyway so RipManager will mark the image as 'errored'
				pass
			result.append({
				'url' : link,
				'saveas' : link[link.rfind('/')+1:]
			})
			if len(result) > SiteBase.MAX_IMAGES_PER_RIP:
				break
Пример #60
0
	def test():
		'''
			Test that ripper is working as expected.
			Raise exception if necessary.
		'''
		from Httpy import Httpy
		httpy = Httpy()

		# Check we can hit the host
		url = 'http://8muses.com/'
		r = httpy.get(url)
		if len(r.strip()) == 0:
			raise Exception('unable to retrieve data from %s' % url)

		# Check ripper gets all images in an album
		url = 'http://www.8muses.com/index/category/hotassneighbor7'
		s = Site8muses(url)
		urls = s.get_urls()
		expected = 21
		if len(urls) != expected:
			return 'expected %d images, got %d. url: %s' % (expected, len(urls), url)
		return None