Example #1
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
    
    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while True:
            # pop the next URL from the queue; an empty queue ends the crawl
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                # store the raw page, then extract and enqueue links that match the rules
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep",i,"of",n
Example #2
File: crawler.py  Project: catsky/auCrawler
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while True:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url is None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception, err:
                print "!!error!! Exception happened! %s %s" % (url, err)
                self.dbop.close()
Example #3
        form.action = urlparse.urljoin(self.url, form.action)
        return form.action, form.fields

    #
    def get_html(self):
        return self.html


if __name__ == "__main__":
    import time
    from downloader import DownloadManager

    downloader = DownloadManager()

    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)

    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=["a"], str_patterns=["^(http://www\.cs\.colorado\.edu)(/info.+)$"])

    elements = page.doc.findall("./body//div")
    for e in elements:
        print "ELEMETNS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
        print e.items()
        print "TEXT-CONTENT-----------------------------------"
        print e.text_content()
Example #4
        form = self.doc.forms[index]
        form.action = urlparse.urljoin(self.url, form.action)
        return form.action, form.fields

    #
    def get_html(self):
        return self.html

    
if __name__ == "__main__":
    import time
    from downloader import DownloadManager
    downloader = DownloadManager()

    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)

    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=['a'], patterns=['^(http://www\.cs\.colorado\.edu)(/info.+)$'])

    elements = page.doc.findall('./body//div')
    for e in elements:
        print "ELEMENTS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
        print e.items()
        print "TEXT-CONTENT-----------------------------------"
        print e.text_content()
Example #5
class Server(object):
    SHUTDOWN_TIMEOUT = 60

    def __init__(self):

        self._cfg = _load_cfg()

        try:
            with open(os.path.join(APP_PATH, 'data.json')) as json_fp:
                user_data = json.load(json_fp)
        except IOError:
            logging.warning('No user data')
            user_data = {}

        self._download_manager = DownloadManager(self._cfg['user'],
                                                 self._cfg['password'],
                                                 user_data.get('session_id'))

    def stop(self):
        self._bottle_server.stop()


    def _on_heartbeat_timeout(self, heartbeat):
        '''
        Called if we haven't had a heartbeat in a while
        '''
        if any(self._download_manager.get_downloading()):
            logging.debug('No heartbeat but downloading.... Still alive')
            heartbeat.beat()
        else:
            logging.debug('No heartbeat, no downloads. Stopping...')
            self._bottle_server.stop()

    def _on_dl_finished(self, path, file_name):
        '''
        TODO this needs sorting
        '''
        import shutil
        save_path = os.path.join(self._cfg['save_location'], file_name)
        logging.debug('Moving <%s> to <%s>' % (path, save_path,))
        shutil.move(path, save_path)

    def start_download(self, game, game_id, mod_id, file_id):
        self._download_manager.download(self._on_dl_finished,
                                        game,
                                        game_id,
                                        mod_id,
                                        file_id)

    def start_server(self, host, port):
        self._bottle_server = StoppableWSGIRefServer(host=host, port=port)

        hb = Heartbeat()
        install(partial(local_variable_plugin, {
            'cfg': self._cfg,
            'heartbeat': hb,
            'server': self,
            'download_manager': self._download_manager,
        }))

        hb_monitor = HeartbeatMonitor(hb, self.SHUTDOWN_TIMEOUT, self._on_heartbeat_timeout)
        hb_monitor.monitor()

        run(server=self._bottle_server)

        self._download_manager.stop()
        hb_monitor.stop()
        _save_cfg(self._cfg)
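
A hedged sketch of driving this Server, based only on the methods shown above; the host and port are placeholders, and run() inside start_server() blocks until the heartbeat monitor or stop() shuts the bottle server down:

if __name__ == '__main__':
    server = Server()                       # loads config and saved session data
    server.start_server('127.0.0.1', 8080)  # placeholder host/port; returns once the server stops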
Example #6
class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(
                url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #7
File: crawler.py  Project: gsliu/Cybertron
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))
    
    def download_files(self, files):
        # fetch each file with wget (-c resumes partial downloads) into config.repos_dir
        # and record per-URL success or failure in the repository-state DB
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                # also collect links matching the file rule and download them
                file_pattern = [re.compile(self.file_rule)]
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO:
                self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep",i,"of",n