Example #1
def __init__(self, path, base_url, count):
    self.url_manager = URLManager()
    self.data_save = DataSave(path)
    self.html_parser = HtmlParse()
    self.downloader = HtmlDownloader()
    self.root_url = base_url
    self.count = count
Example #2
class Scheduler:
    def __init__(self, path, root_url, count):
        # Initialize the crawler components
        self.url_manager = URLManager()
        self.data_save = DataSave(path)
        self.html_parser = HtmlParse()
        self.downloader = HtmlDownloader()
        self.root_url = root_url
        self.count = count

    def run_spider(self):
        # Seed the set of uncrawled URLs with the root URL
        self.url_manager.save_new_url(self.root_url)
        # Keep crawling while uncrawled URLs remain and fewer than
        # `count` articles have been fetched
        while self.url_manager.get_new_url_num() and self.url_manager.get_old_url_num() < self.count:
            try:
                # Take one uncrawled URL
                url = self.url_manager.get_new_url()
                # Download the page
                response = self.downloader.download(url)
                # Parse it, returning newly found URLs and the article data
                new_urls, data = self.html_parser.parse_data(url, response)
                # Queue the newly found URLs as uncrawled
                self.url_manager.save_new_urls(new_urls)
                # Save the article data to a local file
                self.data_save.save(data)
                print("Crawled {0} articles so far".format(len(self.url_manager.old_urls)))
            except Exception as e:
                print("Stopped crawling this article: {0}".format(e))
Example #3
class Scheduler:
    def __init__(self, path, base_url, count):
        self.url_manager = URLManager()
        self.data_save = DataSave(path)
        self.html_parser = HtmlParse()
        self.downloader = HtmlDownloader()
        self.root_url = base_url
        self.count = count

    def run_spider(self):
        self.url_manager.save_new_url(self.root_url)
        while self.url_manager.get_new_url_num() and self.url_manager.get_old_url_num() < self.count:
            try:
                url = self.url_manager.get_new_url()
                response = self.downloader.download(url)
                new_urls, data = self.html_parser.parse_data(response, url)
                self.url_manager.save_new_urls(new_urls)
                self.data_save.save(data)
                print("Already spider {} pages".format(
                    len(self.url_manager.old_urls)))
            except Exception as e:
                print("Error", e)
Example #4
if __name__ == "__main__":
    import sys
    print(sys.argv)
    # Take the root domain from the command line, defaulting to vatgia.com
    if len(sys.argv) > 1:
        rootURL = sys.argv[1]
    else:
        rootURL = 'vatgia.com'
    print('Crawl', rootURL)

    from urlmanager import URLManager
    from urlrequest import URLRequest
    from storage import MySQLDBStorage
    from htmlprocessor import VGHMTLProcessor

    # Wire up MySQL-backed storage, the URL frontier, and the HTML processor
    storage = MySQLDBStorage('localhost', 'root', 'adminpass', 'nc_vgdb')
    urlManager = URLManager(storage)
    processor = VGHMTLProcessor(storage)
    urlManager.addURL('http://{0}'.format(rootURL))
    urlManager.addURL('http://{0}/home'.format(rootURL))
    urlManager.addURL('http://{0}/home/'.format(rootURL))

    # Crawl until the URL frontier is exhausted
    urlRequest = URLRequest('testlink.txt')
    while True:
        nextURL = urlManager.nextURL()
        if not nextURL:
            break
        urlRequest.execute(nextURL, urlManager, processor)
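
Note that this URLManager variant has a different interface from Examples #1-#3: it is constructed around a storage object and exposes addURL/nextURL. A minimal sketch of that contract, with the MySQL-backed persistence replaced by in-memory structures (every body is an assumption):

class URLManager:
    """Sketch of the addURL/nextURL frontier used above."""

    def __init__(self, storage):
        self.storage = storage  # assumed to persist URLs in the original
        self.pending = []       # queue of URLs still to visit
        self.seen = set()       # everything ever queued, for deduplication

    def addURL(self, url):
        # Deduplicate before queueing so the crawl terminates
        if url not in self.seen:
            self.seen.add(url)
            self.pending.append(url)

    def nextURL(self):
        # Return the next queued URL, or None when the frontier is empty
        return self.pending.pop(0) if self.pending else None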
    
Example #5
from peewee import *
from urlmanager import URLManager
from time import sleep

urlman = URLManager()

# The user name is masked in the original source
db = MySQLDatabase("dev_bookmarks", user="******", passwd="hkGVUX26w8ivEP")


class Bookmarks(Model):
    id = IntegerField()
    url = CharField()
    domain = CharField()
    title = CharField()

    class Meta:
        database = db


# Backfill missing domains and re-fetch titles that captured a block page
for bk in Bookmarks.select():
    updated = False
    if bk.domain is None:
        bk.domain = urlman.get_domain(bk.url)
        updated = True
    if bk.title is None or "CloudFlare" in bk.title or bk.title == "Too Many Requests":
        bk.title = urlman.get_title(bk.url)
        updated = True
    if updated:
        print("Updating " + bk.url + "...")
        bk.save()
        sleep(3)  # throttle requests to avoid being rate limited
Example #6
from urlmanager import URLManager

url = 'http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts'

umgr = URLManager()

# Print the page title and the domain extracted from the URL
print(umgr.get_title(url))
print(umgr.get_domain(url))
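
get_domain and get_title are not shown in any example. A minimal sketch of what Examples #5 and #6 assume, using urllib.parse plus requests and bs4 (the library choices and method bodies are assumptions):

from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


class URLManager:
    """Sketch of the two helpers Examples #5 and #6 rely on."""

    def get_domain(self, url):
        # Host part of the URL, e.g. 'www.aflcio.org'
        return urlparse(url).netloc

    def get_title(self, url):
        # Fetch the page and return the text of its <title> tag, if any
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, "html.parser")
        if soup.title and soup.title.string:
            return soup.title.string.strip()
        return None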