Example no. 1
    def loop(self, url, next, post=None, cb=None, cc=1, deep=2, debug=0, allow_external=False, link_filter=None, start_now=True, **options):
        # remember visited URLs by md5 hash so the same page is never queued twice
        doneurls = [common.md5(url)]

        domain = common.get_domain(url).lower()

        def page_loaded(doc):
            # keep following links while the crawl depth limit is not reached
            if doc.req['meta']['deep'] < deep:
                for n in doc.q(next):
                    nexturl = n.nodevalue()

                    # skip links that leave the start domain, unless allow_external is set
                    if not allow_external and domain != common.get_domain(nexturl).lower():
                        continue
                    if link_filter and not link_filter(url=nexturl):
                        continue

                    if common.md5(nexturl) not in doneurls:
                        doneurls.append(common.md5(nexturl))
                        req = Request(url=nexturl, meta=dict(deep=doc.req['meta']['deep'] + 1), use_cache=True, cb=page_loaded, **options)
                        self.downloader.put(req)

            # allow the loop caller to process each loaded page
            if cb:
                cb(doc)

        self.downloader.put(Request(url=url, post=post, meta=dict(deep=1), use_cache=True, cb=page_loaded, **options))

        self.downloader.cc = cc
        if start_now:
            self.downloader.start()
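
A minimal usage sketch for the loop() method above. The Scraper class name, the start URL and the XPath expressions are assumptions for illustration; only the loop() signature and the doc object passed to the callback come from the code shown.

# Hypothetical usage sketch (Scraper, the URL and the XPaths are assumptions)
def print_titles(doc):
    # called once for every downloaded page
    for node in doc.q("//h2"):
        print(node.nodevalue())

s = Scraper()  # assumed scraper class exposing loop() as shown above
s.loop(
    url='http://example.com/products',   # assumed start URL
    next="//a[@class='next']/@href",     # XPath selecting the links to follow
    cb=print_titles,                     # per-page callback
    deep=2,                              # follow links one level beyond the start page
    cc=3,                                # downloader concurrency (downloader.cc)
)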
Example no. 2
    def mine_emails(self, url):
        """ 
		looks for emails on key pages of a website: homepage, contact

		"""
        if not url: return []
        if not common.subreg(url, '^(http)'):
            url = 'http://' + url
        if '@' in url:
            return common.get_emails(url)
        domain = common.get_domain(url)
        emails = []

        def _parse_emails(doc):
            # check link text/hrefs first (e.g. mailto links), as they are the most reliable source
            link_texts = doc.q("//a").join(' | ')

            for email in common.get_emails(link_texts):

                if '@' in email and email not in emails:
                    emails.append(email)

            if not emails:
                # fall back to the page text, with scripts removed
                html = doc.remove("//script").html()
                for email in common.get_emails(html):

                    if '@' in email and email not in emails:
                        emails.append(email)

        homepage = self.load(url)
        _parse_emails(homepage)

        if emails:
            # found on the homepage; no need to check other pages
            return emails

        contact_url = homepage.x(
            "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href"
        )

        if contact_url:
            contactpage = self.load(contact_url)
            _parse_emails(contactpage)

        return emails
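
A usage sketch for the mine_emails() method above, assuming it is exposed on a scraper instance s; the site list is illustrative.

# Hypothetical usage sketch: collect contact emails for a list of candidate sites
sites = ['example.com', 'http://example.org']

for site in sites:
    emails = s.mine_emails(site)   # returns a (possibly empty) list of addresses
    print(site, '->', emails or 'no email found')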
Example no. 3
        def page_loaded(doc):
            # Link-following callback from the loop() method in Example no. 1.
            # It relies on names bound in the enclosing scope: doneurls, domain, deep,
            # next, allow_external, link_filter, cb, options and self.
            if doc.req['meta']['deep'] < deep:
                for n in doc.q(next):
                    nexturl = n.nodevalue()

                    if not allow_external and domain != common.get_domain(nexturl).lower():
                        continue
                    if link_filter and not link_filter(url=nexturl):
                        continue

                    if common.md5(nexturl) not in doneurls:
                        doneurls.append(common.md5(nexturl))
                        req = Request(url=nexturl, meta=dict(deep=doc.req['meta']['deep'] + 1), use_cache=True, cb=page_loaded, **options)
                        self.downloader.put(req)

            # allow the loop caller to process each loaded page
            if cb:
                cb(doc)
Example no. 4
def mine_emails(url, br=None, deep_level=1):
    """
	deep_level = 1: scrape home page and contact page only

	"""

    if not url: return []
    if not common.subreg(url, '^(http)'):
        url = 'http://' + url
    if '@' in url:
        return common.get_emails(url)

    domain = common.get_domain(url).lower()

    history = {}

    def _load_page(page_url, current_level):
        """
		Please make sure this _url is not loaded yet, to avoid loaded twice

		"""
        logger.debug('mine_emails page %s, level %s', page_url, current_level)
        html = ''
        if br:
            try:
                br.get(page_url)

                html = br.page_source

            except Exception as e:
                logger.warning('failed to _load_page: %s', page_url)
                # logger.exception(e)
                raise  # re-raise so the caller can recreate the browser (br)

        else:
            # s is an HTTP client defined elsewhere in the enclosing module
            html = s.load_html(page_url)

        doc = Doc(url=page_url, html=html)
        # collect same-domain links on this page for the next crawl level
        links = doc.q("//a")

        sub_urls = []

        for link in links:
            _url = link.href()

            # keep only links whose URL contains the start domain
            if domain not in _url.lower():
                continue

            if _url in history:
                continue
            if _url not in sub_urls:
                sub_urls.append(_url)

        # record this page; its sub-links belong to the next crawl level
        history[page_url] = (current_level + 1, sub_urls)

        return doc

    def _parse_emails(doc):
        emails = []
        # first try the link text/hrefs only, because they are the most reliable source
        link_texts = doc.q("//a").join(' | ')

        for email in common.get_emails(link_texts):

            if '@' in email and email not in emails:
                emails.append(email)

        if not emails:
            # fall back to the page text, with scripts removed
            html = doc.remove("//script").html()
            for email in common.get_emails(html):

                if '@' in email and email not in emails:
                    emails.append(email)
        return emails

    def _load_subpages(level):
        # first, compile all the URLs recorded at this crawl level in the history
        urls = []
        for page_url in history:
            _level, suburls = history[page_url]
            if _level != level:
                continue

            for suburl in suburls:
                if suburl in history:
                    continue

                if suburl not in urls:
                    urls.append(suburl)

        logger.debug('mine emails in level %s, with %s urls to process', level,
                     len(urls))
        for suburl in urls:

            doc = _load_page(suburl, level)
            emails = _parse_emails(doc)
            if emails:
                # emails found on this page; stop here
                return emails

        #not found
        return []

    doc = _load_page(url, current_level=1)
    emails = _parse_emails(doc)

    if emails:
        return emails

    contact_url = doc.x(
        "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href")
    if contact_url:
        doc = _load_page(contact_url, current_level=2)
        emails = _parse_emails(doc)

        # once a contact page is found, do not dig further even if it yields no emails

        return emails

    #try with level 2

    if deep_level >= 2:
        emails = _load_subpages(level=2)
        if emails:
            return emails

    #try with level 3

    if deep_level >= 3:
        emails = _load_subpages(level=3)
        if emails:
            return emails

    #not found
    return []
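
A usage sketch for the standalone mine_emails() above. Judging from br.get() and br.page_source, br is expected to behave like a Selenium WebDriver; the driver setup and the target URL below are assumptions.

# Hypothetical usage sketch: try the plain HTTP path first, then retry with a
# Selenium WebDriver (assumed) for JavaScript-heavy sites.
from selenium import webdriver

emails = mine_emails('http://example.com', deep_level=1)

if not emails:
    br = webdriver.Chrome()
    try:
        # crawl one level deeper with the browser
        emails = mine_emails('http://example.com', br=br, deep_level=2)
    finally:
        br.quit()

print(emails)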