Example #1
import sys
import time
import zlib
from datetime import datetime

# Parser, Queue, Dictionary, Link, and lib.PyURLOpener are project-local dependencies assumed importable.

class Crawler:
    fileno = 0
    total_data_downloaded = 0
    total_status_200 = 0
    total_status_401 = 0
    total_status_404 = 0
    total_status_500 = 0

    def __init__(self, max_links_allowed, compress_status):
        self.__html_parser = Parser()
        self.__bfs_tree = Queue()
        self.__unique_links = Dictionary(max_links_allowed)
        self.__compress = compress_status
        self.__pyurlopener = lib.PyURLOpener()
        self.__start_time = datetime.now()

    def format_validate_and_save_links(self, links_to_crawl, base_link, depth):
        links_to_crawl = self.__remove_duplicates(links_to_crawl)

        for link in links_to_crawl:
            link_to_process = Link(link, depth)
            link_to_process.fix_relative_link(base_link)
            link_to_process.set_link_attrs()
            link_to_process.remove_duplicate_link(self.__unique_links)
            link_to_process.remove_invalid_link()
            link_to_process.remove_uncrawlable_link()
            self.__save_link(link_to_process)

    def crawl(self):
        next_link = self.__next_url()
        links_to_crawl = self.__html_parser.get_links(next_link.redirected)
        return {'links_to_crawl': links_to_crawl,
                'next_link': next_link.redirected,
                'depth': next_link.depth + 1}

    def clear(self):
        self.__html_parser.clear()

    def display(self):
        print(self.__unique_links.all())

    def get_num_links_saved(self):
        return self.__unique_links.size()

    # PRIVATE

    def __remove_duplicates(self, links_to_crawl):
        return list(set(links_to_crawl))

    def __save_link(self, link):
        if link.redirected is not None:
            insert_status = self.__unique_links.insert(link.normalized, link.redirected)
            self.__bfs_tree.enqueue(link)

            crawled_urls = open("data/crawled_urls.txt", "a+")

            if insert_status:
                file_size = self.__save_page(link)
                self.__save_url(link, file_size, crawled_urls)
            else:
                # Insert was rejected (e.g. the link cap was reached): write the final stats and stop.
                crawled_urls.write(self.__stats())
                crawled_urls.close()
                sys.exit()

            crawled_urls.close()

    def __save_page(self, link):
        new_file = open("data/pages/" + str(self.fileno) + ".html", "w")
        page_html = link.opened.read()

        if page_html is None:
            page_html = "Link [" + link.redirected + "]: HTML not retrieved"
        else:
            page_html = link.redirected + "\n\n" + page_html
        
        if self.__compress:
            new_file.write(zlib.compress(page_html))
        else:
            new_file.write(page_html)

        new_file_size = new_file.tell()
        self.__update_stats(new_file_size, link.code)

        link.opened.close()
        new_file.close()

        return new_file_size

    def __save_url(self, link, file_size, crawled_urls):
        timestamp = time.asctime(time.localtime(time.time()))
        crawled_urls.write("[" + timestamp + "][" + self.__to_mb(file_size) + " MB][" +
                           str(link.code) + "][" + str(link.depth) + "] " + link.redirected + "\n")

    def __next_url(self):
        return self.__bfs_tree.dequeue()

    def __stats(self):
        time_taken = datetime.now() - self.__start_time
        
        stats = "\nCrawl Stats:\n"
        stats += "-------------------------------------\n"
        stats += "- Pages crawled:   " + str(self.__unique_links.size()) + " pages\n" 
        stats += "- Data downloaded: " + self.__to_mb(self.total_data_downloaded) + " MB\n"
        stats += "- Time Taken:      " + str(time_taken.seconds) + " seconds\n"
        stats += "- HTTP 200 count:  " + str(self.total_status_200) + "\n"
        stats += "- HTTP 401 count:  " + str(self.total_status_401) + "\n"
        stats += "- HTTP 404 count:  " + str(self.total_status_404) + "\n"
        stats += "- HTTP 500 count:  " + str(self.total_status_500)
        return stats
    
    def __update_stats(self, new_file_size, code):
        self.fileno += 1
        self.total_data_downloaded += new_file_size
        
        if code == 200:
            self.total_status_200 += 1
        elif code == 401:
            self.total_status_401 += 1
        elif code == 404:
            self.total_status_404 += 1
        elif code == 500:
            self.total_status_500 += 1

    def __to_mb(self, num_bytes):
        # Convert a byte count to megabytes, formatted to three decimal places.
        return '%.3f' % (float(num_bytes) / 1048576)
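
Example #1 only defines the crawler's building blocks, so a minimal driver sketch follows. It is an assumption, not part of the original source: it presumes the seed URL can be fed through format_validate_and_save_links() as a plain string at depth 0, and it relies on __save_link() ending the run via sys.exit() once the link cap is hit.

# Hypothetical driver loop (sketch only; MAX_LINKS and seed_url are made-up names).
MAX_LINKS = 100
seed_url = 'http://example.com/'

crawler = Crawler(MAX_LINKS, compress_status=False)

# Feed the seed through the same validation path used for discovered links.
crawler.format_validate_and_save_links([seed_url], seed_url, 0)

while crawler.get_num_links_saved() < MAX_LINKS:
    result = crawler.crawl()
    crawler.format_validate_and_save_links(result['links_to_crawl'],
                                           result['next_link'],
                                           result['depth'])
    crawler.clear()

crawler.display()
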
Example #2
"""
This script just for test and learn how to crawl web pages using python

"""

from parser import Parser
from crawl import Crawl

c = Crawl()
c.fetch('http://www.blogfa.com/')

p = Parser()
p.set_html(c.content)

p.get_title()
p.get_links()

print "count of links: %s" % len(p.links)
print "title of current url: %s" % p.title