Example #1
import os
import urllib.parse
import urllib.robotparser


class mySpider(object):
    def __init__(self):
        self.will_crawl = []        # frontier of URLs waiting to be crawled
        self.visited = set()        # URLs already crawled (or skipped after an error)
        self.parser = HTMLParser()  # link extractor providing findLinks(url), defined elsewhere

    def crawl(self, initial_url, maxFiles):
        # Normalize the URL: strip any '#' fragment, drop a trailing '/',
        # and make sure it starts with a scheme
        fragment = initial_url.find('#')
        if fragment != -1:
            initial_url = initial_url[:fragment]

        if initial_url.endswith('/'):
            initial_url = initial_url[:-1]

        if not initial_url.startswith("http"):
            initial_url = "http://" + initial_url

        # Check robots.txt to ensure the crawl is polite
        initdomain = "{uri.netloc}".format(
            uri=urllib.parse.urlparse(initial_url))
        print("domain is " + initdomain)
        robot = urllib.robotparser.RobotFileParser()
        robot.set_url("http://" + initdomain + "/robots.txt")
        robot.read()
        if robot.can_fetch("*", initial_url):
            # seed the frontier with initial_url only if robots.txt allows it
            self.will_crawl.append(initial_url)

        filenum = 1  # file counter; the crawl stops after maxFiles pages have been saved
        directory = "../file_cache/unprocessed/newly_crawled/"
        if not os.path.exists(directory):
            os.makedirs(directory)

        while self.will_crawl and filenum <= maxFiles:

            url = self.will_crawl.pop(0)  # take the next URL from the frontier
            try:
                print("Spider at:", url)
                openedURL = urlopenFun(url)  # helper defined elsewhere that opens the URL
                sourceCode = openedURL.read()
                # encode the URL so it can be used as a file name
                encoded = urllib.parse.quote(str(url), safe='.')
                encoded = encoded.replace(".", "%2E")

                name = '{0}/{1}.html'.format(directory, encoded)
                try:
                    with open(name, 'wb') as f:
                        f.write(sourceCode)
                    filenum = filenum + 1  # count the page just saved
                except OSError:
                    print("\tFile Exception\t{0}".format(name))

                links = self.parser.findLinks(url)  # extract outgoing links from the page
                self.visited.add(url)  # mark url as visited

                # Add links to the frontier if not visited already and robots.txt allows them
                for link in links:
                    try:
                        if (link not in self.visited) and (
                                link not in self.will_crawl) and robot.can_fetch(
                                    "*", link):
                            self.will_crawl.append(link)
                    except Exception:
                        self.visited.add(link)
                        print("Some error occurred adding to frontier.")
            except Exception as e:
                self.visited.add(url)
                print(e)
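
The example assumes two helpers that are not shown: urlopenFun, which opens a URL and returns a file-like response, and an HTMLParser class whose findLinks(url) returns the links found on that page. Below is a minimal sketch of what they could look like, built on urllib.request and the standard html.parser module (the names, the User-Agent header, and the example.com call are assumptions for illustration, not part of the original), followed by an example call.

import urllib.parse
import urllib.request
from html.parser import HTMLParser as BaseHTMLParser


def urlopenFun(url):
    # Assumed helper: open the URL with an explicit User-Agent header
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return urllib.request.urlopen(request)


class HTMLParser(BaseHTMLParser):
    # Assumed helper: fetch a page and collect absolute link targets from its <a> tags
    def findLinks(self, url):
        self.reset()       # clear any state left over from a previous page
        self.base = url
        self.links = []
        html = urlopenFun(url).read().decode("utf-8", errors="ignore")
        self.feed(html)
        return self.links

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(urllib.parse.urljoin(self.base, value))


# Example: crawl up to 10 pages starting from example.com
spider = mySpider()
spider.crawl("example.com", 10)

Since will_crawl is popped from the front, the crawl proceeds breadth-first; for larger crawls a collections.deque would make that pop an O(1) operation, while the visited check is already O(1) because visited is a set.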