Пример #1
0
	def fetch(self):
		"""
		Retrieve the content for ``self.job`` and record the outcome on the job.

		The job URL is first passed through ``WebMirror.util.urlFuncs.urlClean``
		and written back to the job. The fetch is then routed, in this order of
		precedence, to one of:

		* ``extractGoogleDriveFolder`` when the netloc contains 'drive.google.com',
		* ``retreiveGoogleDoc`` when ``gdp.isGdocUrl`` reports a match,
		* ``retreiveGoogleFile`` when ``gdp.isGFileUrl`` reports a match,
		* ``retreivePlainResource`` otherwise.

		When the response contains both a 'title' and a 'contents' entry, the job's
		title, content, mimetype, is_text and state attributes are updated in place.

		Returns whatever the selected retrieval method returned.
		"""
		job = self.job
		job.url = WebMirror.util.urlFuncs.urlClean(job.url)

		host = urllib.parse.urlsplit(job.url.lower()).netloc

		# Both probes are evaluated up front, before any branch is chosen.
		docMatch, docUrl = gdp.isGdocUrl(job.url)
		fileMatch, _fileUrl = gdp.isGFileUrl(job.url)

		if 'drive.google.com' in host:
			self.log.info("Google Drive content!")
			response = self.extractGoogleDriveFolder(job)
		elif docMatch:
			self.log.info("Google Docs content!")
			response = self.retreiveGoogleDoc(job, docUrl)
		elif fileMatch:
			self.log.info("Google File content!")
			# NOTE(review): this hands over the URL returned by isGdocUrl(), not the
			# one returned by isGFileUrl() (which is otherwise unused) - confirm
			# that this is intentional.
			response = self.retreiveGoogleFile(job, docUrl)
		else:
			response = self.retreivePlainResource(job)

		hasPage = 'title' in response and 'contents' in response
		if not hasPage:
			return response

		job.title = response['title']
		job.content = response['contents']
		job.mimetype = 'text/html'
		job.is_text = True
		job.state = 'complete'

		return response
Пример #2
0
    def processNewUrl(self, url, baseUrl=None, istext=True):
        """
        Validate a discovered URL, completing it against `baseUrl` when needed.

        Args:
            url: The candidate URL.
            baseUrl: Optional URL used to resolve `url` when `url` does not
                start with "http" (e.g. a scheme-relative "//host/path" link).
            istext: When true, the URL must also pass ``self.checkDomain``.

        Returns:
            The validated (possibly completed) URL, or None when `url` does not
            start with "http", no `baseUrl` was given, and
            ``self.ignoreBadLinks`` is set.

        Raises:
            ValueError: If the URL cannot be completed, is changed by
                ``gdp.trimGDocUrl``, fails the domain check while `istext` is
                true, or contains '/view/export?format=zip'.
        """
        if not url.lower().startswith("http"):
            if baseUrl:
                # Resolve the link against the base URL so that it inherits the
                # base's scheme (and host, for relative links). The previous code
                # rebuilt the URL purely from baseUrl's own components, so the
                # result was always the lower-cased base URL and `url` was lost.
                url = urllib.parse.urljoin(baseUrl, url)

            elif self.ignoreBadLinks:
                self.log.error("Skipping a malformed URL!")
                self.log.error("Bad URL: '%s'", url)
                return
            else:
                raise ValueError("Url isn't a url: '%s'" % url)

        # NOTE(review): if these gdp helpers return (flag, url) tuples, this
        # condition is always truthy and the trim check below runs for every
        # URL - confirm before tightening it.
        if gdp.isGdocUrl(url) or gdp.isGFileUrl(url):
            if gdp.trimGDocUrl(url) != url:
                raise ValueError("Invalid link crept through! Link: '%s'" %
                                 url)

        # urljoin leaves links carrying their own non-http scheme untouched, so
        # they are rejected here.
        if not url.lower().startswith('http'):
            raise ValueError("Failure adding scheme to URL: '%s'" % url)

        if not self.checkDomain(url) and istext:
            raise ValueError("Invalid url somehow got through: '%s'" % url)

        if '/view/export?format=zip' in url:
            raise ValueError("Wat?")
        return url
Пример #3
0
	def processNewUrl(self, url, baseUrl=None, istext=True):
		"""
		Validate a discovered URL, completing it against `baseUrl` when needed.

		Args:
			url: The candidate URL.
			baseUrl: Optional URL used to resolve `url` when `url` does not
				start with "http" (e.g. a scheme-relative "//host/path" link).
			istext: When true, the URL must also pass ``self.checkDomain``.

		Returns:
			The validated (possibly completed) URL, or None when `url` does not
			start with "http", no `baseUrl` was given, and
			``self.ignoreBadLinks`` is set.

		Raises:
			ValueError: If the URL cannot be completed, is changed by
				``gdp.trimGDocUrl``, fails the domain check while `istext` is
				true, or contains '/view/export?format=zip'.
		"""
		if not url.lower().startswith("http"):
			if baseUrl:
				# Resolve the link against the base URL so that it inherits the
				# base's scheme (and host, for relative links). The previous code
				# rebuilt the URL purely from baseUrl's own components, so the
				# result was always the lower-cased base URL and `url` was lost.
				url = urllib.parse.urljoin(baseUrl, url)

			elif self.ignoreBadLinks:
				self.log.error("Skipping a malformed URL!")
				self.log.error("Bad URL: '%s'", url)
				return
			else:
				raise ValueError("Url isn't a url: '%s'" % url)

		# NOTE(review): if these gdp helpers return (flag, url) tuples, this
		# condition is always truthy and the trim check below runs for every
		# URL - confirm before tightening it.
		if gdp.isGdocUrl(url) or gdp.isGFileUrl(url):
			if gdp.trimGDocUrl(url) != url:
				raise ValueError("Invalid link crept through! Link: '%s'" % url)

		# urljoin leaves links carrying their own non-http scheme untouched, so
		# they are rejected here.
		if not url.lower().startswith('http'):
			raise ValueError("Failure adding scheme to URL: '%s'" % url)

		if not self.checkDomain(url) and istext:
			raise ValueError("Invalid url somehow got through: '%s'" % url)

		if '/view/export?format=zip' in url:
			raise ValueError("Wat?")
		return url
Пример #4
0
    def dispatchUrlRequest(self, url, pageDistance):
        """
        Fetch `url` with the retrieval method matching its kind, then process it.

        The URL is passed through ``TextScrape.urlFuncs.urlClean`` and a leading
        "//" is replaced with "http://". Dispatch precedence: Google Drive
        (netloc contains "drive.google.com"), Google Docs, Google File, and
        finally a plain resource fetch.

        When the response contains both "title" and "contents", the database
        entry for the URL is updated. The response is then handed, together
        with `pageDistance`, to ``self.processResponse``.
        """
        url = TextScrape.urlFuncs.urlClean(url)

        # Links beginning with a bare "//" have turned up a few times; give
        # them an explicit http scheme.
        if url.startswith("//"):
            url = "http://" + url[len("//"):]

        host = urllib.parse.urlsplit(url.lower()).netloc

        # Both probes are evaluated up front, before any branch is chosen.
        docMatch, docUrl = gdp.isGdocUrl(url)
        fileMatch, _fileUrl = gdp.isGFileUrl(url)

        if "drive.google.com" in host:
            self.log.info("Google Drive content!")
            response = self.extractGoogleDriveFolder(url)
        elif docMatch:
            self.log.info("Google Docs content!")
            response = self.retreiveGoogleDoc(docUrl)
        elif fileMatch:
            self.log.info("Google File content!")
            # NOTE(review): this hands over the URL returned by isGdocUrl(),
            # not the one returned by isGFileUrl() (which is otherwise
            # unused) - confirm that this is intentional.
            response = self.retreiveGoogleFile(docUrl)
        else:
            response = self.retreivePlainResource(url)

        if all(key in response for key in ("title", "contents")):
            pageFields = {
                "url": url,
                "title": response["title"],
                "contents": response["contents"],
                "mimetype": "text/html",
                "dlstate": 2,
                "istext": True,
            }
            self.updateDbEntry(**pageFields)

        self.processResponse(response, pageDistance)
Пример #5
0
    def dispatchUrlRequest(self, url, pageDistance):
        """
        Retrieve `url` through the appropriate fetcher and process the response.

        Steps, in order:

        1. Pass the URL through ``TextScrape.urlFuncs.urlClean`` and replace a
           leading "//" with 'http://'.
        2. Pick a fetcher: Google Drive when the netloc contains
           'drive.google.com', else Google Docs, else Google File, else the
           plain resource fetcher.
        3. If the response contains both 'title' and 'contents', update the
           database entry for the URL.
        4. Pass the response and `pageDistance` to ``self.processResponse``.
        """
        cleaned = TextScrape.urlFuncs.urlClean(url)

        # A leading "//" has shown up a few times; swap it for an explicit
        # http scheme.
        url = 'http://' + cleaned[2:] if cleaned.startswith("//") else cleaned

        domain = urllib.parse.urlsplit(url.lower()).netloc

        # Both probes run before any branch is selected.
        gdocHit, gdocUrl = gdp.isGdocUrl(url)
        gfileHit, _gfileUrl = gdp.isGFileUrl(url)

        if 'drive.google.com' in domain:
            self.log.info("Google Drive content!")
            response = self.extractGoogleDriveFolder(url)
        elif gdocHit:
            self.log.info("Google Docs content!")
            response = self.retreiveGoogleDoc(gdocUrl)
        elif gfileHit:
            self.log.info("Google File content!")
            # NOTE(review): the URL passed here comes from isGdocUrl(), while
            # the one from isGFileUrl() goes unused - confirm that this is
            # intentional.
            response = self.retreiveGoogleFile(gdocUrl)
        else:
            response = self.retreivePlainResource(url)

        hasPage = 'title' in response and 'contents' in response
        if hasPage:
            self.updateDbEntry(
                url=url,
                title=response['title'],
                contents=response['contents'],
                mimetype='text/html',
                dlstate=2,
                istext=True,
            )

        self.processResponse(response, pageDistance)