Example #1
 def is_page_robot_scannable(self):
     """
     Returns a boolean that tells whether the page is robot scrapeable.
     """
     robotcheck = RobotFileParser()
     robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
     robotcheck.read()
     return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
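Most of the examples on this page were written for Python 2, where the parser lives in the robotparser module; on Python 3 the same class is available as urllib.robotparser.RobotFileParser. A minimal, self-contained Python 3 sketch of the same check as Example #1 — the function name and the default user agent are placeholders, not taken from the example:

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def is_url_fetchable(url, user_agent="MyCrawler/1.0"):
    """Return True if the host's robots.txt allows user_agent to fetch url."""
    parts = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
    parser.read()  # downloads and parses robots.txt
    return parser.can_fetch(user_agent, url)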
Example #2
 def test_parse(self):
     from robotparser import RobotFileParser
     rules = RobotFileParser()
     rules.set_url("http://www.sogou.com/robots.txt")
     rules.read()
     self.assertEqual(
         rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
         False)
Example #3
    def _get_robot_parser(self):
        if self.robot_parser_pickle is not None:
            return pickle.loads(base64.b64decode(self.robot_parser_pickle))
        else:
            parser = RobotFileParser()
            parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
            self.robot_parser = parser

            return parser
Example #4
 def _get_robot_parser(self):
     try:
         return pickle.loads(str(self.robot_parser_pickle))
     except (TypeError, IndexError):
         parser = RobotFileParser()
         parser.set_url(str(self.protocol) + "://" + str(self.domain) + \
                        "/robots.txt")
         self.robot_parser = parser
         return parser
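Examples #3 and #4 show only the getter half of a cached, pickled parser; presumably a matching setter writes robot_parser_pickle back. A hedged sketch of what that counterpart could look like — the field name and the property wiring are assumptions carried over from the examples, not confirmed source:

import base64
import pickle

class RobotParserCacheMixin(object):
    robot_parser_pickle = None  # assumed text field holding the encoded parser

    def _set_robot_parser(self, parser):
        # Mirror the base64.b64decode(...) used by _get_robot_parser in Example #3:
        # encode on the way in so the pickle survives a text-column round trip.
        self.robot_parser_pickle = base64.b64encode(pickle.dumps(parser)).decode("ascii")

    # _get_robot_parser would be one of the variants shown above; the pair is
    # typically exposed as: robot_parser = property(_get_robot_parser, _set_robot_parser)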
Example #6
def checkRobots(URL):

    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
Example #7
def checkRobots(URL):

	time.sleep(1)
	parsed = urlparse(URL)
	robotsUrl = parsed.scheme + "://"+ parsed.netloc+"/robots.txt"
	robotParser = RobotFileParser()
	robotParser.set_url(robotsUrl)
	robotParser.read()
	result = robotParser.can_fetch("*",URL)
	return result
Example #8
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url:
    :return:
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
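Example #8 only uses can_fetch, but since Python 3.6 RobotFileParser also exposes the politeness hints from robots.txt via crawl_delay() and request_rate() (and site_maps() since 3.8). A short sketch of reading them; the URL is a placeholder:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

delay = rp.crawl_delay("*")    # None if robots.txt has no Crawl-delay for this agent
rate = rp.request_rate("*")    # None, or a named tuple with .requests and .seconds
if delay is not None:
    print("robots.txt asks for a %ss pause between requests" % delay)
if rate is not None:
    print("robots.txt allows %s requests per %s seconds" % (rate.requests, rate.seconds))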
Example #9
 def can_fetch(self, url):
     host, path = urlparse.urlparse(url)[1:3]
     if (self.rules.has_key(host)):
         return self.rules[host].can_fetch(self.agent, url)
     else:
         rp = RobotFileParser()
         robot_url = "http://" + host + "/robots.txt"
         rp.set_url(robot_url)
         rp.read()
         self.rules[host] = rp
         return rp.can_fetch(self.agent, url)
Example #10
	def can_fetch(self,url):
		host,path=urlparse.urlparse(url)[1:3]
		if	(self.rules.has_key(host)):
			return self.rules[host].can_fetch(self.agent,url)
		else:
			rp=RobotFileParser()
			robot_url="http://"+host+"/robots.txt"
			rp.set_url(robot_url)
			rp.read()
			self.rules[host]=rp
			return rp.can_fetch(self.agent,url)	
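Examples #9 and #10 cache one parser per host but never refresh it. RobotFileParser records when its rules were last loaded (mtime()), which makes a simple expiry check easy. A Python 3 sketch of the same per-host cache with a TTL — the TTL value and the helper name are arbitrary choices, not taken from the examples:

import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

ROBOTS_TTL = 24 * 3600   # re-fetch robots.txt after a day (arbitrary)
_robots_cache = {}       # host -> RobotFileParser

def can_fetch(url, agent="*"):
    parts = urlparse(url)
    rp = _robots_cache.get(parts.netloc)
    if rp is None or time.time() - rp.mtime() > ROBOTS_TTL:
        rp = RobotFileParser(parts.scheme + "://" + parts.netloc + "/robots.txt")
        rp.read()        # fetches the file; parsing it timestamps the rules
        _robots_cache[parts.netloc] = rp
    return rp.can_fetch(agent, url)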
Example #11
class HolidayScrapper:
    def __init__(self):
        self.rp = RobotFileParser()
        self.rp.set_url('https://www.timeanddate.com/robots.txt')
        self.rp.read()
        if not self.rp.can_fetch('WasThereAHoliday', init_url):
            raise RuntimeError('Scraping forbidden due to robots.txt file')
        self.countries = self.get_countries(self.get_page(init_url))
        try:
            # removing entries which are not countries
            self.countries.remove('un')
        except ValueError:
            pass
        try:
            # removing entries which are not countries
            self.countries.remove('world')
        except ValueError:
            pass

    def get_data(self):
        all_data = pd.DataFrame()
        for cntr in self.countries:
            print 'Fetching data for ' + cntr
            try:
                url = 'https://www.timeanddate.com/holidays/' + cntr + '/2016#!hol=8389401'
                if not self.rp.can_fetch('WasThereAHoliday', url):
                    raise RuntimeError(
                        'Scraping forbidden due to robots.txt file')
                soup = self.get_page('https://www.timeanddate.com/holidays/' +
                                     cntr + '/2016#!hol=8389401')
                html_table = soup.find('table')
                df_table = pd.read_html(str(html_table))[0]
                df_table['country'] = cntr
                all_data = all_data.append(df_table)
            except ValueError:
                print 'Problem occurred when fetching data for ' + cntr
                pass
        return all_data

    @staticmethod
    def get_page(url):
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        return soup

    @staticmethod
    def get_countries(soup):
        countries = []
        select_list = soup.find(id="co")
        for cntr in select_list.children:
            countries.append(cntr['value'])
        return countries
Example #12
def robots_check(url):

    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix

    # checking whether robots.txt allows fetching this url
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Example #13
class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.
    
    :param hostname: the name of the host extracted from an URL.
    '''
    def __init__(self, hostname):
        self.hostname = hostname
        
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)
        
    def url_allowed(self, url):
        ''' Checks if the given url is allowed to crawl.
        
        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
Example #14
def can_read(url):

    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False

        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False

    return res
Example #15
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Example #16
    def urlopen(self, host):
        robo_url = host.get_robots_url()

        print self.robotdict

        cached_parser = self.robotdict.get(robo_url)
        if cached_parser:
            logging.info("Found in Cache: " + robo_url)
        else:
            logging.info("Fetching: " + robo_url)
            cached_parser = RobotFileParser()
            self.robotdict.put(robo_url, cached_parser)
            cached_parser.set_url(robo_url)
            cached_parser.read()

        if cached_parser.can_fetch('*', host.get_url()):
            print 'Going to fetch:', host.get_url()
            return self.fetch_file(host.get_url())
        else:
            logging.info("Forbidden by Robots.txt")
            return None
Example #18
class spider(object):
    CurLink = ""
    linknText = []
    headings = []

    def __init__(self, link):
        self.CurLink = link
        self.r = RobotFileParser()

    def crawl(self):
        self.r.set_url(urlparse.urljoin(self.CurLink, "/robots.txt"))
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")

        for i in self.bs.findAll("h1", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h2", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h3", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h4", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h5", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h6", text=True):
            self.headings.append(i.text)

        for link in self.bs.findAll('a', href=True):
            aLink = urlparse.urljoin(self.CurLink, link['href'])

            if (self.r.can_fetch("*", aLink)):
                self.linknText.append({
                    "URL": aLink,
                    "AnchorText": link.string
                })
Example #19
class Crawler():
	
	# Variables
	parserobots = False
	output 	= None
	report 	= False

	config 	= None
	domain	= ""

	exclude = []
	skipext = []
	drop    = []
	
	debug	= False

	tocrawl = set([])
	crawled = set([])
	excluded = set([])

	marked = {}

	# TODO also search for window.location={.*?}
	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

	rp = None
	response_code={}
	nb_url=1 # Number of url.
	nb_rp=0 # Number of url blocked by the robots.txt
	nb_exclude=0 # Number of url excluded by extension or word
	
	output_file = None

	target_domain = ""

	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
		self.parserobots = parserobots
		self.output 	= output
		self.report 	= report
		self.domain 	= domain
		self.exclude 	= exclude
		self.skipext 	= skipext
		self.drop		= drop
		self.debug		= debug

		if self.debug:
			logging.basicConfig(level=logging.DEBUG)

		self.tocrawl = set([domain])

		try:
			self.target_domain = urlparse.urlparse(domain)[1]
		except:
			raise ValueError("Invalid domain")


		if self.output:
			try:
				self.output_file = open(self.output, 'w')
			except:
				logging.debug ("Output file not available.")
				exit(255)

	def run(self):
		print (config.xml_header, file=self.output_file)

		logging.debug("Start the crawling process")

		while len(self.tocrawl) != 0:
			self.__crawling()

		logging.debug("Crawling as reach the end of all found link")

		print (config.xml_footer, file=self.output_file)


	def __crawling(self):
		crawling = self.tocrawl.pop()

		url = urlparse.urlparse(crawling)
		self.crawled.add(crawling)
		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
		
		try:
			response = urlopen(request)
		except Exception as e:
			if hasattr(e,'code'):
				if e.code in self.response_code:
					self.response_code[e.code]+=1
				else:
					self.response_code[e.code]=1

				# Track marked urls for reporting
				if self.report:
					if e.code in self.marked:
						self.marked[e.code].append(crawling)
					else:
						self.marked[e.code] = [crawling]

			logging.debug ("{1} ==> {0}".format(e, crawling))
			return self.__continue_crawling()

		# Read the response
		try:
			msg = response.read()
			if response.getcode() in self.response_code:
				self.response_code[response.getcode()]+=1
			else:
				self.response_code[response.getcode()]=1

			response.close()

			# Get the last modify date
			if 'last-modified' in response.headers:
				date = response.headers['Last-Modified']
			else:
				date = response.headers['Date']

			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

		except Exception as e:
			logging.debug ("{1} ===> {0}".format(e, crawling))
			return None


		print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file if file else self.output_file)
		if self.output_file:
			self.output_file.flush()

		# Found links
		links = self.linkregex.findall(msg)
		for link in links:
			link = link.decode("utf-8")
			#logging.debug("Found : {0}".format(link))		
			if link.startswith('/'):
				link = 'http://' + url[1] + link
			elif link.startswith('#'):
				link = 'http://' + url[1] + url[2] + link
			elif not link.startswith('http'):
				link = 'http://' + url[1] + '/' + link
			
			# Remove the anchor part if needed
			if "#" in link:
				link = link[:link.index('#')]

			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse.urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if (link in self.crawled):
				continue
			if (link in self.tocrawl):
				continue
			if (link in self.excluded):
				continue
			if (domain_link != self.target_domain):
				continue
			if ("javascript" in link):
				continue
			
			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if (not self.can_fetch(link)):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)
			
		return None

	def __continue_crawling(self):
		if self.tocrawl:
			self.__crawling()

	def exclude_link(self,link):
		if link not in self.excluded:
			self.excluded.add(link)

	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.read()

	def can_fetch(self, link):
		try:
			if self.parserobots:
				if self.rp.can_fetch("*", link):
					return True
				else:
					logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
					return False

			if not self.parserobots:
				return True

			return True
		except:
			# On error continue!
			logging.debug ("Error during parsing robots.txt")
			return True

	def exclude_url(self, link):
		for ex in self.exclude:
			if ex in link:
				return False
		return True

	def make_report(self):
		print ("Number of found URL : {0}".format(self.nb_url))
		print ("Number of link crawled : {0}".format(len(self.crawled)))
		if self.parserobots:
			print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
		if self.skipext or self.exclude:
			print ("Number of link exclude : {0}".format(self.nb_exclude))

		for code in self.response_code:
			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

		for code in self.marked:
			print ("Link with status {0}:".format(code))
			for uri in self.marked[code]:
				print ("\t- {0}".format(uri))
Example #20
class WebPage(object):
    def __init__(self, url):
        self.page_url = url
        self.parsed_url = urlparse.urlparse(url)
        self.lang = ""
        self.isDownload = False
        self.title = ""
        self.text = ""
        self.soup = None
        self.robot = RobotFileParser()

    def __normalize_link__(self, link):
        if not link:
            return None
        if link.startswith('//'):
            return self.parsed_url.scheme + ':' + link
        elif link.startswith('/'):
            return self.parsed_url.scheme + '://' + self.parsed_url.hostname + link
        elif link.startswith('http://') or link.startswith('https://'):
            return link
        elif link.startswith("irc://"):
            return None
        elif link.startswith('#') or link.startswith('javascript:'):
            return None
        else:
            return urlparse.urljoin(self.page_url, link)

    def __delete_unnecessary_tags(self):
        if self.soup is None:
            return

        if self.soup.title is None:
            self.title = ""
        else:
            self.title = self.soup.title.string

        for tag in self.soup(
            ['style', 'script', '[document]', 'head', 'title']):
            tag.decompose()

    def __get_stems(self, text):
        if self.lang in LANGUAGES:
            stemer = snowballstemmer.stemmer(LANGUAGES[self.lang])
        else:
            raise NotImplementedError("That lang not implemented")
        stems_dict = dict()

        for char in [",", ". ", "!", "?", " - ", "/n"]:
            text = text.replace(char, " ")

        for word in text.split():
            stem_word = stemer.stemWord(word.lower())
            if stem_word in stems_dict:
                stems_dict[stem_word] += 1
            else:
                stems_dict[stem_word] = 1

        return stems_dict

    def download_page(self):
        try:
            self.robot.set_url("{0}://{1}/robots.txt".format(
                self.parsed_url.scheme, self.parsed_url.hostname))
            self.robot.read()
            if self.robot.can_fetch("*", self.page_url):
                response = requests.get(self.page_url, verify=False)
            else:
                return False
        except requests.exceptions.InvalidSchema:
            return False
        except KeyError:
            return False
        except Exception:
            return False

        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, "html.parser")
            self.__delete_unnecessary_tags()
            self.text = "".join(self.soup.strings)
            try:
                self.lang = detect(self.text)
            except Exception:
                self.lang = "en"
            self.isDownload = True
            return True
        else:
            return False

    def get_links(self):
        if not self.isDownload:
            raise Exception("You should download page")

        def get_links_generator():
            for link in self.soup.find_all("a"):
                normalized_link = self.__normalize_link__(link.get("href"))
                if normalized_link is None:
                    continue
                else:
                    yield normalized_link

        return get_links_generator()

    def get_text_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.text)

    def get_title_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.title)

    def get_domain(self):
        return self.parsed_url.hostname
Example #21
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}                          # map that records the visits of urls, datemodified and assets
        self.network = {}                           # map that maintains the network/graph of webpages visited
                                                    # The intention of this map is for visual rendering using d3.js

        self.unvisited = set([])                    # a set to keep the list of urls yet to be visited
        self.start_page = None                       # the root page; used to avoid cycles and to keep the crawl
                                                     # process limited to a single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            This method holds the invoking control of the crawler method and drives the crawling process.
            Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
            and scraping the urls.

            Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
            just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method to just write the contents of the file into a given file name.
            Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   base url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s"%(url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s"%(url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s"%(url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href

        return href

    def get_out_going_edges(self, url, html_body):
        """
            This method encompasses the BFS along with the coupling with crawl and generator as it changes the state
            of the unvisited map. Basically this method extracts the links that belong to the same domain as the start
            page, cleans them with compose_url_from_href method and updates the map. This also avoids unnecessary traps
            like href links pointing to 'javascript', 'mailto' etc.
        :param url:         current page url
        :param html_body:   current page's html content
        :return:            returns all the valid and wellformed out going links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if  not str(new_page.netloc).endswith(self.start_page):          # doesn't belong to domain
                continue

            if  self.robot_allows(href) and \
                not href in self.site_map.keys()            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href                   and \
                not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
            Any time a specific url of a site is changed, its last-modified date and time are kept in the page headers.
            This info helps bots and crawlers to not to crawl the page if it has not been updated since last crawl.
            This method is used to preserve the url crawled and its last-modified time along with assets scraped into
            the container dictionary for later usage to generate sitemap and visualization network.
        :param url:         url of the just finished crawling page
        :param headers:     header information of the crawled page
        :param html_body:   html content of the page
        :return:            None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
            An html page could contain other links such as .css, .img, .mp4 and .js. All these files are not dynamic
            though they could produce dynamic results. The code or text that exists in these files is constant and
            static. These files are referred to as static assets and for the definition of this challenge, I have chosen
            to keep all the info in a single dictionary and extract them at the end for reports, results and stats.
        :param html_body:       html content of the page.
        :return:                returns a dictionary that encompasses .css, .img and .js files as lists.
        """
        # add static assets of the page .css, .js and image urls may be ?
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and ignoring it for the time being.
        # an extract like html_body with just needed parts is a must for excluding inline scripts and styles.
        csss = []
        imgs = []
        jss = []
        for x in soup.findAll('script'):
            try:
                jss.append(x['src'])
            except KeyError:
                pass

        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])

        return {
                'css': csss,
                'img': imgs,
                'js':  jss
        }

    def crawl(self):
        """
            The main driver method that crawls the pages. This main does below steps:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return:   None
        """
        page = self.unvisited.pop()
        # if robot.txt is defined, use Disallow to avoid pages. domain.robot.txt doesn't exist so the crawler
        # must find all the pages for report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read() # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js':  self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
            Returns the compiled sitemap structure
        :return:       sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
            Returns the compiled network in the order of the crawled pages
        :return:       network graph
        """
        return self.network

    def get_network_json_format(self):
        """
            Returns the crawl traverse order sequence in json format
        :return:       network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
            This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules: return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                    return True
            return False
        except:
            return True
Example #22
class SimpleCrawler:

  USER_AGENT = 'SimpleCrawler/0.1'
  HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive'
    }
  CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)
  
  def __init__(self, starturl, index_html='', maxlevel=1,
               cookie_file=None, acldb=None, urldb=None, default_charset=None,
               delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
      self.cookiejar = MozillaCookieJar(cookie_file)
      self.cookiejar.load()
    else:
      self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.robotstxt.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
      starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}                   # 1:injected, 2:crawled
    return

  def accept_url(self, url):
    if url.endswith('/'):
      url += self.index_html
    if self.acldb and not self.acldb.allowed(url):
      return None
    return url
  
  def inject_url(self, url):
    if (not self.curlevel) or (not url) or (url in self.crawled): return False
    if not self.robotstxt.can_fetch(self.USER_AGENT, url):
      if self.debug:
        print >>stderr, 'DISALLOW: %r' % url
      return None
    if self.debug:
      print >>stderr, 'INJECT: %r' % url
    self.crawled[url] = 1
    self.urls.append((url, self.curlevel-1))
    return True

  def get1(self, url, maxretry=3, maxredirect=3):
    if self.debug:
      print >>stderr, 'GET: %r' % url
    # loop
    for rtry in range(maxredirect):
      # forge urllib2.Request object.
      req = Request(url)
      # add cookie headers if necessary.
      if self.cookiejar:
        self.cookiejar.add_cookie_header(req)
        headers = req.unredirected_hdrs
        headers.update(self.HEADERS)
      else:
        headers = self.HEADERS
      # get response.
      for ctry in range(maxretry):
        try:
          if not self.conn:
            print >>stderr, 'Making connection: %r...' % (self.hostport,)
            self.conn = HTTPConnection(self.hostport)
          self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
          self.conn.sock.settimeout(self.timeout)
          resp = self.conn.getresponse()
          break
        except BadStatusLine, x:
          # connection closed unexpectedly
          print >>stderr, 'Connection closed unexpectedly.'
          # it restarts the connection...
          self.conn.close()
          self.conn = None
        except socket.error, x:
          # connection closed unexpectedly
          print >>stderr, 'Socket error:', x
          self.conn.close()
          self.conn = None
      else:
Example #23
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        if self.needs_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent':settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://'+self.urlparse[1]+'/robots.txt') # Only works with http right now.
                self.robotcheck.read()
                self.needs_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.needs_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent':settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text) #REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        self.pagetext = ""
        for item in filter(self.get_visible_elements, self.page_text):
            if item != '\n':
                self.pagetext += item
        self.pagelinks = {}

        for link in self.pagehtml.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass

        # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ') #Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word]={'url':self.url,'offsets':[index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
Example #24
class SiteMap():
    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {
        }  # map that records the visits of urls, datemodified and assets
        self.network = {
        }  # map that maintains the network/graph of webpages visited
        # The intention of this map is for visual rendering using d3.js

        self.unvisited = set(
            [])  # a set to keep the list of urls yet to be visited
        self.start_page = None  # the root page; used to avoid cycles and to keep the crawl
        # process limited to a single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" +
                              main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            This method holds the invoking control of the crawler method and drives the crawling process.
            Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
            and scraping the urls.

            Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
            just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method to just write the contents of the file into a given file name.
            Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   base url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s" % (url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s" % (url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s" % (url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href

        return href

    def get_out_going_edges(self, url, html_body):
        """
            This method encompasses the BFS along with the coupling with crawl and generator as it changes the state
            of the unvisited map. Basically this method extracts the links that belong to the same domain as the start
            page, cleans them with compose_url_from_href method and updates the map. This also avoids unnecessary traps
            like href links pointing to 'javascript', 'mailto' etc.
        :param url:         current page url
        :param html_body:   current page's html content
        :return:            returns all the valid and wellformed out going links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[
                0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if not str(new_page.netloc).endswith(
                    self.start_page):  # doesn't belong to domain
                continue

            if  self.robot_allows(href) and \
                not href in self.site_map.keys()            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href                   and \
                not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
            Any time a specific url of a site is changed, its last-modified date and time are kept in the page headers.
            This info helps bots and crawlers to not to crawl the page if it has not been updated since last crawl.
            This method is used to preserve the url crawled and its last-modified time along with assets scraped into
            the container dictionary for later usage to generate sitemap and visualization network.
        :param url:         url of the just finished crawling page
        :param headers:     header information of the crawled page
        :param html_body:   html content of the page
        :return:            None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
            An html page could contain other links such as .css, .img, .mp4 and .js. All these files are not dynamic
            though they could produce dynamic results. The code or text that exists in these files is constant and
            static. These files are referred to as static assets and for the definition of this challenge, I have chosen
            to keep all the info in a single dictionary and extract them at the end for reports, results and stats.
        :param html_body:       html content of the page.
        :return:                returns a dictionary that encompasses .css, .img and .js files as lists.
        """
        # add static assets of the page .css, .js and image urls may be ?
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and ignoring it for the time being.
        # an extract like html_body with just needed parts is a must for excluding inline scripts and styles.
        csss = []
        imgs = []
        jss = []
        for x in soup.findAll('script'):
            try:
                jss.append(x['src'])
            except KeyError:
                pass

        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])

        return {'css': csss, 'img': imgs, 'js': jss}

    def crawl(self):
        """
            The main driver method that crawls the pages. This main does below steps:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return:   None
        """
        page = self.unvisited.pop()
        # if robot.txt is defined, use Disallow to avoid pages. domain.robot.txt doesn't exist so the crawler
        # must find all the pages for report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(
                len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js': self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
            Returns the compiled sitemap structure
        :return:       sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
            Returns the compiled network in the order of the crawled pages
        :return:       network graph
        """
        return self.network

    def get_network_json_format(self):
        """
            Returns the crawl traverse order sequence in json format
        :return:       network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
            This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules: return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            return True
Example #25
class MarioDepth:
    def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
        self.concount = concount
        self.callback = callback
        self.callpre = callpre
        self.callfail = callfail
        self.depth = depth
        self.starturl = starturl
        self.baseurl = URL.baseurl(starturl)
        self.urls = []
        self.crawled = {}
        self.link_title_db = LinkTitleDB()
        self.accept_url_patterns = accept_url_patterns
        self.reject_url_patterns = reject_url_patterns
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.referer = starturl
        try:
            self.robotstxt.read()
        except:
            logger.debug(Traceback())
        #self.lightcloud = LightCloud.connect('n0')
    
    def __call__(self, n=None):
        if n: self.concount = n
        current_depth = self.depth
        self.urls.append((self.starturl, current_depth))
        while self.urls:
            self.depth_get()
            logger.debug('%d unprocessed urls'%(len(self.urls)))
    
    def depth_get(self):
        mario = MarioBatch(callback=self.next_depth, callpre=self.callpre, callfail=self.callfail)
        pool = coros.CoroutinePool(max_size=len(self.urls))
        while self.urls:
            waiters = []
            #self.add_job(mario)
            counter = 0
            while self.urls:
                if counter > 9: break;
                counter += 1
                waiters.append(pool.execute(self.add_job, mario))
            logger.debug('Depth break')
            for waiter in waiters:
                waiter.wait()
            mario(self.concount)
    
    def add_job(self, mario):
        if not self.urls: return
        url, depth = self.urls.pop()
        if self.visited(url, depth): return
        mario.add_job(url, args=depth)
        
    def visited(self, url, depth):
        #is_duplicate = URL.is_duplicate(url, self.lightcloud)
        return depth==0 and is_duplicate or depth < self.depth and self.crawled.has_key(url) and self.crawled[url] == 2
    
    def next_depth(self, response):
        #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
        for link, title in URL.link_title(response.body, response.effective_url):
            if not self.inject_url(link, response.args):continue
            self.link_title_db.add(link, response.effective_url, title)
        if callable(self.callback): self.callback(response)
        self.crawled[response.effective_url] = 2
        if response.effective_url != response.url:
            self.crawled[response.url] = 2
        self.referer = response.effective_url
    
    def inject_url(self, url, depth):
        if not (depth and url and url not in self.crawled): 
            #logger.debug('IGNORE(%d): %r'%(depth, url))
            return None
        if isinstance(url, unicode): url = url.encode('utf-8')
        if self.reject_url(url): 
            logger.debug('REJECT: %r' % url)
            return None
        try:
            can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
        except:
            can_fetch = True
        if self.baseurl!='http://hi.baidu.com/' and not can_fetch:
            logger.debug('DISALLOW: %r' % url)
            return None
        logger.debug('INJECT(%d): %r' % (depth-1, url))
        self.crawled[url] = 1
        self.urls.append((url, depth-1))
        return True
    
    def reject_url(self, url):
        return self.baseurl != URL.baseurl(url) and (not self.accept_url_patterns or not re.match('|'.join(self.accept_url_patterns), url) or self.reject_url_patterns or re.match('|'.join(self.reject_url_patterns), url))
        
Example #26
class spider(object):
    CurLink = ""
    linkURI = []
    texts = []
    Meta = {}

    def __init__(self, link):
        self.CurLink = link
        self.r = RobotFileParser()

    def crawl(self):
        self.r.set_url(urlparse.urljoin(self.CurLink, "/robots.txt"))
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")

        for script in self.bs(["script", "style"]):
            script.extract()
        text = self.bs.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        for chunk in chunks:
            if chunk:
                self.texts.append(chunk)

        # site = urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.aspx"
        # r = requests.get(site)
        if requests.get(
                urlparse.urlsplit(self.CurLink).scheme + "://" +
                urlparse.urlsplit(self.CurLink).netloc +
                "/sitemap.aspx").ok == True:
            root = etree.fromstring(
                requests.get(
                    urlparse.urlsplit(self.CurLink).scheme + "://" +
                    urlparse.urlsplit(self.CurLink).netloc +
                    "/sitemap.xml").content)
            for sitemap in root:
                children = sitemap.getchildren()
                self.linkURI.append(children[0].text)
        elif requests.get(
                urlparse.urlsplit(self.CurLink).scheme + "://" +
                urlparse.urlsplit(self.CurLink).netloc +
                "/sitemap.xml").ok == True:
            root = etree.fromstring(
                requests.get(
                    urlparse.urlsplit(self.CurLink).scheme + "://" +
                    urlparse.urlsplit(self.CurLink).netloc +
                    "/sitemap.xml").content)
            for sitemap in root:
                children = sitemap.getchildren()
                self.linkURI.append(children[0].text)
        else:
            for link in self.bs.findAll('a', href=True):
                aLink = urlparse.urljoin(self.CurLink, link['href'])

                if (self.r.can_fetch("*", aLink)):
                    self.linkURI.append(aLink)

        page = metadata_parser.MetadataParser(url=self.CurLink)
        meta = page.metadata

        keyw = "null"
        descr = "null"
        if (meta.get('meta').get('Keywords')):
            keyw = meta['meta']['Keywords'].split(', ')

        if (meta.get('meta').get('Description')):
            descr = meta['meta']['Description']

        self.Meta = {
            'title': meta['page']['title'],
            'url': meta['_internal']['url_actual'],
            'description': descr,
            'keyword': keyw
        }
Example #27
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self,
                 starturl,
                 index_html='',
                 maxlevel=1,
                 cookie_file=None,
                 acldb=None,
                 urldb=None,
                 default_charset=None,
                 delay=0,
                 timeout=300,
                 debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        # assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        # self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >> stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >> stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel - 1))
        return True

    def get1(self, url, maxretry=5, maxredirect=5):
        if self.debug:
            print >> stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >> stderr, 'Making connection: %r...' % (
                            self.hostport, )
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET',
                                      req.get_selector().replace(' ', ''), '',
                                      headers)
                    # self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
Example #28
	def test_parse(self):
		from robotparser import RobotFileParser
		rules=RobotFileParser()
		rules.set_url("http://www.sogou.com/robots.txt")
		rules.read()
		self.assertEqual(rules.can_fetch("mozilla","http://www.sogou.com/sohu/robots.txt"),False)