Exemplo n.º 1
0
	def convertToReaderUrl(self, inUrl, resource=False):
		"""
		Rewrite `inUrl` into the prefixed, percent-quoted form used for relinking.

		The URL is first passed through `urlFuncs.urlClean` and
		`self.preprocessReaderUrl`. URLs starting with "data:" or
		"javascript:void(0);" are returned as-is after that step. Protocol-relative
		URLs ("//...") get the scheme of `self.pageUrl` if that attribute exists,
		otherwise "http" (with a logged warning).

		The result is the lower-cased "RESOURCE:<secret>" (when `resource` is
		truthy) or "CONTENT:<secret>" prefix followed by the quoted URL. Note that
		the entire prefix, including `config.relink_secret`, is lower-cased.
		"""
		target = self.preprocessReaderUrl(urlFuncs.urlClean(inUrl))
		# The link will have been canonized at this point

		# Inline images and javascript NOP links are never relinked.
		if target.startswith(("data:", "javascript:void(0);")):
			return target

		# Protocol-relative URLs need an explicit scheme before quoting.
		if target.startswith("//"):
			if not hasattr(self, "pageUrl"):
				self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
				proto = "http"
			else:
				proto = urllib.parse.urlsplit(self.pageUrl).scheme
			target = "{}:{}".format(proto, target)

		template = "RESOURCE:{}" if resource else "CONTENT:{}"
		marker = template.format(config.relink_secret)
		return '%s%s' % (marker.lower(), urllib.parse.quote(target))
Exemplo n.º 2
0
	def fetch(self):
		"""
		Clean `self.target_url`, retrieve it, and dispatch the retrieved content.

		`self.target_url` is overwritten with the result of `url_util.urlClean`.
		`self.getItem` must return a 3-item sequence (content, file name, mime
		type), which is forwarded to `self.dispatchContent`; its return value is
		returned unchanged.
		"""
		self.target_url = url_util.urlClean(self.target_url)

		retrieved = self.getItem(self.target_url)
		body, filename, mime = retrieved

		return self.dispatchContent(body, filename, mime)
Exemplo n.º 3
0
	def convertToReaderUrl(self, inUrl, resource=False):
		"""
		Build the relink form of `inUrl`: a lower-cased prefix plus the quoted URL.

		Steps, in order:
		  1. Run the URL through `urlFuncs.urlClean` and `self.preprocessReaderUrl`.
		  2. Return it untouched if it starts with "data:" or "javascript:void(0);".
		  3. If it starts with "//", prepend a scheme: the one parsed out of
		     `self.pageUrl` when that attribute exists, else "http" (a warning is
		     logged in that case).
		  4. Prefix with "RESOURCE:<config.relink_secret>" when `resource` is truthy,
		     "CONTENT:<config.relink_secret>" otherwise. The whole prefix is
		     lower-cased, secret included.
		"""
		link = urlFuncs.urlClean(inUrl)
		link = self.preprocessReaderUrl(link)
		# The link will have been canonized at this point

		is_inline_data = link.startswith("data:")
		# Second check is only evaluated when the first is false, as in a chain of ifs.
		if is_inline_data or link.startswith("javascript:void(0);"):
			return link

		if link.startswith("//"):
			# Fix protocol-relative URLs
			if hasattr(self, "pageUrl"):
				page_scheme = urllib.parse.urlsplit(self.pageUrl).scheme
			else:
				self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
				page_scheme = "http"
			link = "{}:{}".format(page_scheme, link)

		if not resource:
			tag = "CONTENT:{}".format(config.relink_secret)
		else:
			tag = "RESOURCE:{}".format(config.relink_secret)

		return '%s%s' % (tag.lower(), urllib.parse.quote(link))
	def processLinkItem(self, url, baseUrl):
		"""
		Normalize a link and pass it to `self.processNewUrl`.

		The URL goes through `urlFuncs.clearOutboundProxy`, `urlFuncs.clearBitLy`
		and, after the bad-word check, `urlFuncs.urlClean`.

		Returns None (without calling `processNewUrl`) when the URL contains any
		entry of `self._badwords`, or when it is a google.com URL that, after
		`urlFuncs.trimGDocUrl`, starts with
		'https://docs.google.com/document/d/images'. Otherwise returns the result
		of `self.processNewUrl(url, baseUrl)`.

		Raises ValueError when `urlFuncs.trimGDocUrl` would change a URL whose
		netloc does not contain "google.com".
		"""
		url = urlFuncs.clearBitLy(urlFuncs.clearOutboundProxy(url))

		if any(blocked in url for blocked in self._badwords):
			return None

		url = urlFuncs.urlClean(url)
		host = urllib.parse.urlsplit(url.lower()).netloc

		if "google.com" not in host:
			# trimGDocUrl is expected to leave non-google URLs alone; a difference
			# here is treated as an error rather than silently accepted.
			trimmed = urlFuncs.trimGDocUrl(url)
			if trimmed != url:
				print('Old URL: "%s"' % url)
				print('Trimmed: "%s"' % trimmed)
				raise ValueError("Wat? Url change? Url: '%s'" % url)
			return self.processNewUrl(url, baseUrl)

		url = urlFuncs.trimGDocUrl(url)
		if url.startswith('https://docs.google.com/document/d/images'):
			return None

		return self.processNewUrl(url, baseUrl)
Exemplo n.º 5
0
    def fetch(self, preretrieved):
        """
        Dispatch content for this item, retrieving it first unless already given.

        When `preretrieved` is truthy it is unpacked as a 3-item sequence
        (content, file name, mime type) and used directly. Otherwise
        `self.target_url` is replaced by `url_util.urlClean(self.target_url)` and
        the triple is obtained from `self.getItem`.

        Returns whatever `self.dispatchContent` returns.
        """
        if preretrieved:
            body, filename, mime = preretrieved
        else:
            self.target_url = url_util.urlClean(self.target_url)
            body, filename, mime = self.getItem(self.target_url)

        return self.dispatchContent(body, filename, mime)
Exemplo n.º 6
0
	def processLinkItem(self, url, baseUrl):
		"""
		Clean a link and pass it to `self.processNewUrl`.

		Returns None (without calling `processNewUrl`) when:
		  - `urlFuncs.cleanUrl` yields an empty/falsy URL,
		  - the URL starts with "https://www.tumblr.com/login",
		  - the URL contains any entry of `self._badwords`,
		  - the URL is a google.com URL that, after `urlFuncs.trimGDocUrl`,
		    starts with 'https://docs.google.com/document/d/images'.
		Otherwise returns the result of `self.processNewUrl(url, baseUrl)`.

		Raises ValueError when `urlFuncs.trimGDocUrl` would change a URL whose
		netloc does not contain "google.com".
		"""
		url = urlFuncs.cleanUrl(url)
		if not url:
			return None

		# Tumblr login redirects are skipped outright.
		if url.startswith("https://www.tumblr.com/login"):
			return None

		# One pass over the blocked words is enough: the URL is not modified
		# between checks, so a repeated scan could never give a different answer.
		if any(badword in url for badword in self._badwords):
			return None

		url = urlFuncs.urlClean(url)

		if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
			url = urlFuncs.trimGDocUrl(url)

			if url.startswith('https://docs.google.com/document/d/images'):
				return None

			return self.processNewUrl(url, baseUrl)

		# trimGDocUrl is expected to leave non-google URLs alone; a difference
		# here is treated as an error. Compute it once and reuse the result.
		trimmed = urlFuncs.trimGDocUrl(url)
		if url != trimmed:
			print('Old URL: "%s"' % url)
			print('Trimmed: "%s"' % trimmed)
			raise ValueError("Wat? Url change? Url: '%s'" % url)

		return self.processNewUrl(url, baseUrl)
Exemplo n.º 7
0
    def processImageLink(self, url, baseUrl):
        """
        Clean an image URL and pass it to `self.processNewUrl` as a non-text item.

        Returns None for a falsy `url` (e.g. tags with `img src=""`); otherwise
        returns the result of
        `self.processNewUrl(cleaned_url, baseUrl=baseUrl, istext=False)`.

        No domain or bad-word filtering is applied to image links here.
        """
        # Tags with `img src=""` do turn up; there is nothing to process for them.
        if url:
            cleaned = urlFuncs.urlClean(url)
            return self.processNewUrl(cleaned, baseUrl=baseUrl, istext=False)

        return None
Exemplo n.º 8
0
	def processImageLink(self, url, baseUrl):
		"""
		Hand an image link to `self.processNewUrl` with `istext=False`.

		A falsy `url` (such as the value from `img src=""`) yields None and nothing
		is processed. Any other URL is passed through `urlFuncs.urlClean` first.
		Image links are not filtered by domain or by blocked words in this method.
		"""
		# Empty `src` attributes show up in real pages; skip them.
		if not url:
			return None

		return self.processNewUrl(
			urlFuncs.urlClean(url),
			baseUrl=baseUrl,
			istext=False,
		)
Exemplo n.º 9
0
	def convertToReaderImage(self, inStr):
		"""
		Return the relink form of an image URL.

		`inStr` is passed through `urlFuncs.urlClean` and then handed to
		`self.convertToReaderUrl` with `resource=True`.
		"""
		cleaned = urlFuncs.urlClean(inStr)
		relinked = self.convertToReaderUrl(cleaned, resource=True)
		return relinked
Exemplo n.º 10
0
	def convertToReaderUrl(self, inUrl):
		"""
		Return the '/books/render' path that points at `inUrl`.

		The URL is passed through `urlFuncs.urlClean` and
		`self.preprocessReaderUrl`, percent-quoted with `urllib.parse.quote`, and
		placed in the `url` query parameter.
		"""
		canonical = self.preprocessReaderUrl(urlFuncs.urlClean(inUrl))
		# The link will have been canonized at this point
		quoted = urllib.parse.quote(canonical)
		return '/books/render?url=%s' % quoted
Exemplo n.º 11
0
	def convertToReaderImage(self, inStr):
		"""
		Convert an image URL via `self.convertToReaderUrl` in resource mode.

		The input is first passed through `urlFuncs.urlClean`; the cleaned value
		is then forwarded with `resource=True` and that call's result is returned.
		"""
		return self.convertToReaderUrl(urlFuncs.urlClean(inStr), resource=True)