Exemplo n.º 1
0
    def __init__(self, seeds):
        """Create the URL queue and enqueue the seed URL(s).

        ``seeds`` may be a single ``str`` or a ``list`` of URLs. Any other
        type is ignored, so no seed is enqueued in that case.
        """
        self.urlQueue = UrlQueue()

        # Normalise both accepted shapes into one iterable of seeds.
        if isinstance(seeds, str):
            seed_urls = [seeds]
        elif isinstance(seeds, list):
            seed_urls = seeds
        else:
            seed_urls = []

        for seed_url in seed_urls:
            self.urlQueue.addUnvisitedUrl(seed_url)

        # Report the string form of the unvisited collection with any leading
        # '[' and trailing ']' characters stripped.
        queued = str(self.urlQueue.unVisited).lstrip('[').rstrip(']')
        print(yellow("Add seeds {unvisitedUrl} site.".format(unvisitedUrl=queued)))
Exemplo n.º 2
0
    def __init__(self, frame, root_url, thread_num, crawl_depth,
                 login_session):
        """Store the crawl settings and create the helper objects.

        ``thread_num``, ``wait_thread_num`` and ``stoped`` are written to the
        ``SpiderMain`` class itself rather than to this instance.
        """
        # Plain per-instance settings supplied by the caller.
        self.frame, self.root_url = frame, root_url

        # Class-level state: both counters start at the requested thread count.
        SpiderMain.thread_num = SpiderMain.wait_thread_num = thread_num
        SpiderMain.stoped = False

        # Helper objects: condition variable, URL queue, then the outputer.
        self.con = threading.Condition()
        self.url_queue = UrlQueue(root_url)
        self.outputer = html_outputer.HtmlOutputer()

        self.crawl_depth, self.login_session = crawl_depth, login_session
        self.threads = []
Exemplo n.º 3
0
                            json_key = int(value[0].get('index'))
                        result[key] = json_response[json_key]
            result.update({'status': 'A'})
            Videos.objects.filter(id=url.id_indb).update(**result)


if __name__ == '__main__':
    # Manual run: configure Django, queue a single video URL and create a Job.
    import os

    # Point Django at the project settings; this has to be in place before
    # django.setup() is called below.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "spiderx.settings")
    import time
    from crawler_models import Url
    import django

    django.setup()
    queue = UrlQueue()
    # Alternative seed URLs kept around for manual runs; uncomment one to use it.
    # NOTE(review): the trailing numbers on some lines are not explained here --
    # confirm what they record before relying on them.
    # queue.put(Url(url='http://www.youku.com', url_tip='youku', url_type='host')) # 235
    # queue.put(Url(url='http://www.acfun.tv', url_tip='acfun', url_type='host')) # 129
    # queue.put(Url(url='http://tv.sohu.com', url_tip='sohutv', url_type='host')) # 299
    # queue.put(Url(url='http://www.bilibili.com', url_tip='bilibili', url_type='host'))
    # queue.put(
    # Url(url='http://v.youku.com/v_show/id_XODk0NjM0Nzcy.html', url_tip='youku', url_type='video',
    #         id_indb=257))
    # The only URL actually queued: one Sohu TV video page tied to id_indb=7200.
    queue.put(Url(url='http://tv.sohu.com/20150306/n409440063.shtml', url_tip='sohutv', url_type='video', id_indb=7200))
    # for i in range(10):


    # NOTE(review): the meaning of the 0 argument is not visible here -- confirm
    # against the Job definition.
    job = Job(0)
    # queue.put(Url(url='http://tv.sohu.com/20150221/n409106225.shtml', url_tip='sohutv', url_type='video'))

    # Timestamp taken before the run; presumably used for timing by code that
    # follows this excerpt.
    start = time.time()
Exemplo n.º 4
0
class Crawler:
    """Crawler that takes URLs from a ``UrlQueue``, collects their links and
    queues those links for later visits.

    The print calls use a single parenthesised argument so they behave the
    same under the Python 2 print statement and the Python 3 print function.
    """

    def __init__(self, seeds):
        """Create the URL queue and enqueue the seed URL(s).

        ``seeds`` may be a single ``str`` or a ``list`` of URLs. Any other
        type is ignored, so no seed is enqueued in that case.
        """
        self.urlQueue = UrlQueue()

        if isinstance(seeds, str):
            self.urlQueue.addUnvisitedUrl(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.urlQueue.addUnvisitedUrl(seed)

        # Report the string form of the unvisited collection with any leading
        # '[' and trailing ']' characters stripped.
        queued = str(self.urlQueue.unVisited).lstrip('[').rstrip(']')
        print(yellow("Add seeds {unvisitedUrl} site.".format(unvisitedUrl=queued)))

    def process(self):
        """Visit the next unvisited URL and queue every link found on it."""
        visitUrl = self.urlQueue.unVisitedUrlDeQuence()
        print(yellow("\nProcessing Url: {url}".format(url=visitUrl)))

        print("Get Hyper Links:")
        # NOTE(review): getHyperLinks is not defined in the visible part of
        # this class -- it is expected to be provided elsewhere.
        links = self.getHyperLinks(visitUrl)
        print(yellow("\nGet {linkno} new links".format(linkno=len(links))))

        print(yellow("\nAdding url: {url} to VisitedQueue".format(url=visitUrl)))
        self.urlQueue.addVisitedUrl(visitUrl)

        print(yellow("\nVisited url count: {count}".format(
            count=str(self.urlQueue.getVisitedUrlCount()))))

        for link in links:
            print("\nAdding new link {link_name} to UnvisitedUrlQueue."
                  .format(link_name=green(link)))
            self.urlQueue.addUnvisitedUrl(link)

        print("\nHave %d unvisited links" % len(self.urlQueue.getUnvisitedUrl()))

    def crawling(self, crawl_count):
        """Call ``process`` until the queue is drained or the limit is hit.

        Args:
            crawl_count: ``0`` keeps crawling until no unvisited URL is left;
                a positive number stops once that many URLs were visited.

        Raises:
            ValueError: if ``crawl_count`` is negative.

        Any exception raised while processing is printed together with its
        traceback, after which the interpreter exits with status 1.
        """
        # Compare by value: identity checks against an int literal are not
        # reliable and are flagged by newer interpreters.
        if crawl_count == 0:
            unlimited = True
        elif crawl_count > 0:
            unlimited = False
        else:
            # Format the actual parameter; the previous code referenced an
            # undefined name here and raised NameError instead.
            raise ValueError(
                "Fetal error : crawler_count->{num} number is invalid.".format(
                    num=str(crawl_count)))

        try:
            if unlimited:
                # Uses the same emptiness check as the limited branch; the
                # misspelled method name made this branch always fail.
                while not self.urlQueue.unVisitedUrlsEmpty():
                    self.process()
            else:
                while (not self.urlQueue.unVisitedUrlsEmpty()
                       and self.urlQueue.getVisitedUrlCount() <= crawl_count - 1):
                    self.process()
        except Exception as e:
            print("process error: {0}".format(e))
            traceback.print_exc()
            sys.exit(1)

        print(self.urlQueue.getVisitedUrlCount())
        print(self.urlQueue.getVisitedUrl())