Example #1
    def base_request(self, method, container=None, name=None, prefix=None,
                     headers=None, proxy=None, contents=None, full_listing=None):
        # Common request method
        url = self.url

        # Avoid a shared mutable default dict: it would carry the
        # X-Auth-Token header over between calls.
        if headers is None:
            headers = {}

        if self.token:
            headers['X-Auth-Token'] = self.token

        if container:
            url = '%s/%s' % (url.rstrip('/'), quote(container))

        if name:
            url = '%s/%s' % (url.rstrip('/'), quote(name))

        url += '?format=json'

        if prefix:
            url += '&prefix=%s' % prefix

        if proxy:
            proxy = urlparse.urlparse(proxy)
            proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
            opener = urllib2.build_opener(proxy)
            urllib2.install_opener(opener)

        req = urllib2.Request(url, headers=headers, data=contents)
        req.get_method = lambda: method
        # Send the request once and read the response
        conn = urllib2.urlopen(req)
        body = conn.read()
        try:
            body_data = json.loads(body)
        except ValueError:
            body_data = None
        return [None, body_data]
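
The base_request variants on this page are methods of a client class that supplies self.url and self.token (a Swift-style object store endpoint). As a rough, standalone sketch of what a call like base_request('GET', container='my-container') boils down to, assuming a made-up storage URL and token:

import json
import urllib2
from urllib import quote

# Made-up endpoint and token, purely for illustration.
storage_url = 'http://127.0.0.1:8080/v1/AUTH_test'
token = 'AUTH_tk0123456789abcdef'

# GET <storage_url>/<container>?format=json with the auth token attached.
url = '%s/%s?format=json' % (storage_url.rstrip('/'), quote('my-container'))
req = urllib2.Request(url, headers={'X-Auth-Token': token})
req.get_method = lambda: 'GET'   # force the HTTP verb, as the method above does

conn = urllib2.urlopen(req)
try:
    body_data = json.loads(conn.read())
except ValueError:
    body_data = None
print body_data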
Example #3
	def main(self, start_url, block_extensions=['.pdf','.gif','.jpg','.JPG','.PNG','.png','.wav','.mp3','.wma'], max_urls = 100):

		# Set user agent string
		opener = urllib2.build_opener()
		opener.addheaders = [
			('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'),
			('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
			('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'),
			#('Accept-Encoding', 'gzip,deflate,sdch'),
			('Accept-Language', 'en-US,en,en-zh;q=0.8'),
			#('Cache-Control', 'max-age=0'),
			#('Connection', 'keep-alive')
		]
		urllib2.install_opener(opener)

		# Get base info
		(scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url)
		fragments = (scheme, netloc, '', '', '', '')
		base_url = urlparse.urlunparse(fragments)
		#print "base_url  -> ", base_url
		
		mainLink = LinkInfo(None, base_url, u'Main', 0, u'first page')
		self.assignID(mainLink)
		

		urls_queue = set([mainLink])
		urls_crawled = set()
		urls_crawled2 = set()

		pool = eventlet.GreenPool(20)

		counter = 0
		tmpC = 0
		while True:
			#Infinite loop sanity check
			counter +=1
			if counter > max_urls:
				break

			for url, body in pool.imap(self.fetch, urls_queue):
				# Remove this url from the queue set
				urls_queue = urls_queue - set([url])

				# Add url to crawled set
				urls_crawled = urls_crawled.union(set([url]))
				urls_crawled2 = urls_crawled2.union(set([url]))

				# Extract links
				links = self.extract_links(url, body, block_extensions)
				if links is None: return urls_crawled
				if tmpC == 100000: return urls_crawled
				tmpC += 1
				for link in links:
					if link not in urls_queue and link not in urls_crawled:
						# Add link to queue
						urls_queue = urls_queue.union(set([link]))
						print u"[valid]: link -> ", link.link

		return urls_crawled
Example #4
    def main(self, start_url, block_extensions=['.pdf'], max_urls = 100):

		# Set user agent string
		opener = urllib2.build_opener()
		opener.addheaders = [
			('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'),
			('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
			('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
			('Accept-Encoding', 'gzip,deflate,sdch'),
			('Accept-Language', 'en-US,en;q=0.8'),
			('Cache-Control', 'max-age=0'),
			('Connection', 'keep-alive')
		]
		urllib2.install_opener(opener)

		# Get base info
		(scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url)
		fragments = (scheme, netloc, '', '', '', '')
		base_url = urlparse.urlunparse(fragments)

		urls_queue = set([base_url])
		urls_crawled = set()

		pool = eventlet.GreenPool(20)

		counter = 0
		while True:
			#Infinite loop sanity check
			counter +=1
			if counter > max_urls:
				break

			for url, body in pool.imap(self.fetch, urls_queue):
				# Remove this url from the queue set
				urls_queue = urls_queue - set([url])

				# Add url to crawled set
				urls_crawled = urls_crawled.union(set([url]))

				# Extract links
				links = self.extract_links(url, body, block_extensions)
				for link in links:
					if link not in urls_queue and link not in urls_crawled:
						# Add link to queue
						urls_queue = urls_queue.union(set([link]))

		return urls_crawled
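
Both crawler variants call self.fetch through pool.imap and expect it to yield (url, body) pairs, but the helper itself is not shown. A minimal sketch of such a helper, assuming the queue holds plain URL strings (as in the variant directly above) and using eventlet's green urllib2 so the pool can overlap requests; the 10-second timeout is an arbitrary choice:

from eventlet.green import urllib2  # cooperative drop-in for urllib2

def fetch(url):
    # Return the URL together with its body so that
    # pool.imap(fetch, urls_queue) yields (url, body) pairs.
    try:
        body = urllib2.urlopen(url, timeout=10).read()
    except Exception:
        body = ''
    return url, body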
Example #5
    def proxyTest(self, row):
        proxy = row[0] + ":" + row[1]
        if 'HTTPS' in row[3]:
            proxies = {"https": "https://" + proxy}
        else:
            proxies = {"http": "http://" + proxy}
        ip = row[0]
        port = row[1]

        theProxy = urllib2.ProxyHandler(proxies)
        opener = urllib2.build_opener(theProxy)
        urllib2.install_opener(opener)
        testResult = 'ok!'
        try:
            webcode = urllib2.urlopen("https://www.fliggy.com/",
                                      timeout=10).getcode()
            #logger.info("Proxy %s is ok" % proxy)
        except Exception, e:
            #logger.warn("Proxy %s is nolonger ok" % proxy)
            self.clean(ip=ip, port=port)
            testResult = 'nolonger ok!'
        return testResult
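
proxyTest takes a row from some proxy table in which row[0] is the IP, row[1] the port and row[3] a protocol field, and self.clean (not shown) presumably removes dead proxies from storage. A standalone sketch of the same check against a made-up proxy record:

import urllib2

# Made-up proxy record: (ip, port, <unused field>, protocol);
# 203.0.113.0/24 is a documentation-only address range.
row = ['203.0.113.7', '8080', 'placeholder', 'HTTPS']

proxy = row[0] + ':' + row[1]
if 'HTTPS' in row[3]:
    proxies = {'https': 'https://' + proxy}
else:
    proxies = {'http': 'http://' + proxy}

urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(proxies)))

try:
    code = urllib2.urlopen('https://www.fliggy.com/', timeout=10).getcode()
    print 'proxy %s is ok (HTTP %d)' % (proxy, code)
except Exception, e:
    print 'proxy %s failed: %s' % (proxy, e)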
Example #6
    def base_request(self, method, container=None, name=None, prefix=None,
                     headers=None, proxy=None, contents=None,
                     full_listing=None, logger=None, additional_info=None):
        # Common request method
        trans_start = time()
        url = self.url

        if headers is None:
            headers = {}

        if self.token:
            headers['X-Auth-Token'] = self.token

        if container:
            url = '%s/%s' % (url.rstrip('/'), quote(container))

        if name:
            url = '%s/%s' % (url.rstrip('/'), quote(name))
        else:
            url += '?format=json'
            if prefix:
                url += '&prefix=%s' % prefix

        if proxy:
            proxy = urlparse.urlparse(proxy)
            proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
            opener = urllib2.build_opener(proxy)
            urllib2.install_opener(opener)

        req = urllib2.Request(url, headers=headers, data=contents)
        req.get_method = lambda: method
        conn = urllib2.urlopen(req)
        body = conn.read()
        try:
            body_data = json.loads(body)
        except ValueError:
            body_data = None
        trans_stop = time()
        if logger:
            sent_content_length = 0
            for n, v in headers.items():
                nl = n.lower()
                if nl == 'content-length':
                    try:
                        sent_content_length = int(v)
                        break
                    except ValueError:
                        pass
            logger.debug("-> " + " ".join(
                quote(str(x) if x else "-", ":/")
                for x in (
                    strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)),
                    method,
                    url,
                    conn.getcode(),
                    sent_content_length,
                    conn.info()['content-length'],
                    trans_start,
                    trans_stop,
                    trans_stop - trans_start,
                    additional_info
                )))
        return [None, body_data]
Example #7
    def base_request(self,
                     method,
                     container=None,
                     name=None,
                     prefix=None,
                     headers=None,
                     proxy=None,
                     contents=None,
                     full_listing=None,
                     logger=None,
                     additional_info=None):
        # Common request method
        trans_start = time()
        url = self.url

        if headers is None:
            headers = {}

        if self.token:
            headers['X-Auth-Token'] = self.token

        if container:
            url = '%s/%s' % (url.rstrip('/'), quote(container))

        if name:
            url = '%s/%s' % (url.rstrip('/'), quote(name))
        else:
            url += '?format=json'
            if prefix:
                url += '&prefix=%s' % prefix

        if proxy:
            proxy = urlparse.urlparse(proxy)
            proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
            opener = urllib2.build_opener(proxy)
            urllib2.install_opener(opener)

        req = urllib2.Request(url, headers=headers, data=contents)
        req.get_method = lambda: method
        conn = urllib2.urlopen(req)
        body = conn.read()
        try:
            body_data = json.loads(body)
        except ValueError:
            body_data = None
        trans_stop = time()
        if logger:
            sent_content_length = 0
            for n, v in headers.items():
                nl = n.lower()
                if nl == 'content-length':
                    try:
                        sent_content_length = int(v)
                        break
                    except ValueError:
                        pass
            logger.debug("-> " + " ".join(
                quote(str(x) if x else "-", ":/")
                for x in (strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)),
                          method, url, conn.getcode(), sent_content_length,
                          conn.info()['content-length'], trans_start,
                          trans_stop, trans_stop - trans_start,
                          additional_info)))
        return [None, body_data]
Example #8
def password_protected_page_downloader(dbConn, log):
    """
    *get a page that is behind HTTPS authentication password protection*

    **Key Arguments:**
      - ``dbConn`` -- mysql database connection
      - ``log`` -- logger
      - ``___`` --

    **Return:**
      - None
    """
    ################ > IMPORTS ################
    ## STANDARD LIB ##
    ## THIRD PARTY ##
    ## LOCAL APPLICATION ##
    import commands
    import urllib2

    theurl = 'https://groups.google.com/a/pessto.org/group/alerts/manage_members/alerts.csv'
    username = '******'
    password = '******'
    # a great password

    passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
    # this creates a password manager
    passman.add_password(None, theurl, username, password)
    # because we have put None at the start it will always
    # use this username/password combination for urls
    # for which `theurl` is a super-url

    authhandler = urllib2.HTTPBasicAuthHandler(passman)
    # create the AuthHandler

    opener = urllib2.build_opener(authhandler)

    urllib2.install_opener(opener)
    # All calls to urllib2.urlopen will now use our handler
    # Make sure not to include the protocol in with the URL, or
    # HTTPPasswordMgrWithDefaultRealm will be very confused.
    # You must (of course) use it when fetching the page though.

    pagehandle = urllib2.urlopen(theurl)
    # authentication is now handled automatically for us

    ################ > VARIABLE SETTINGS ######
    # command = 'wget --output-document=- --quiet --http-user=david.young --http-password=spac3d0ct0r https://groups.google.com/a/pessto.org/group/alerts/manage_members/alerts.csv'
    # status, text = commands.getstatusoutput(command)

    # url = "https://david.young:[email protected]/a/pessto.org/group/alerts/manage_members/alerts.csv"
    # try:
    #   urllib2.urlopen(urllib2.Request(url))
    # except urllib2.HTTPError, e:
    #   print e.headers
    #   print e.headers.has_key('WWW-Authenticate')

    ################ >ACTION(S) ################
    # log.debug('status %s' % (status,))
    # log.debug('text %s' % (text,))

    return
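
The function stops after urllib2.urlopen(theurl); the CSV itself would come from pagehandle.read(). Since install_opener replaces the opener used by every later urllib2.urlopen call in the process, a caller that does not want subsequent requests to stay authenticated can put a default opener back, for example:

import urllib2

# build_opener() with no arguments recreates the default handler chain;
# installing it undoes the globally-installed basic-auth opener.
urllib2.install_opener(urllib2.build_opener())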
Example #9
    def main(self,
             start_url,
             block_extensions=[
                 '.pdf', '.gif', '.jpg', '.JPG', '.PNG', '.png', '.wav',
                 '.mp3', '.wma'
             ],
             max_urls=100):

        # Set user agent string
        opener = urllib2.build_opener()
        opener.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'
             ),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
             ),
            ('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'),
            #('Accept-Encoding', 'gzip,deflate,sdch'),
            ('Accept-Language', 'en-US,en,en-zh;q=0.8'),
            #('Cache-Control', 'max-age=0'),
            #('Connection', 'keep-alive')
        ]
        urllib2.install_opener(opener)

        # Get base info
        (scheme, netloc, path, params, query,
         fragment) = urlparse.urlparse(start_url)
        fragments = (scheme, netloc, '', '', '', '')
        base_url = urlparse.urlunparse(fragments)
        #print "base_url  -> ", base_url

        mainLink = LinkInfo(None, base_url, u'Main', 0, u'first page')
        self.assignID(mainLink)

        urls_queue = set([mainLink])
        urls_crawled = set()
        urls_crawled2 = set()

        pool = eventlet.GreenPool(20)

        counter = 0
        tmpC = 0
        while True:
            #Infinite loop sanity check
            counter += 1
            if counter > max_urls:
                break

            for url, body in pool.imap(self.fetch, urls_queue):
                # Remove this url from the queue set
                urls_queue = urls_queue - set([url])

                # Add url to crawled set
                urls_crawled = urls_crawled.union(set([url]))
                urls_crawled2 = urls_crawled2.union(set([url]))

                # Extract links
                links = self.extract_links(url, body, block_extensions)
                if links is None: return urls_crawled
                if tmpC == 100000: return urls_crawled
                tmpC += 1
                for link in links:
                    if link not in urls_queue and link not in urls_crawled:
                        # Add link to queue
                        urls_queue = urls_queue.union(set([link]))
                        print u"[valid]: link -> ", link.link

        return urls_crawled