Example #1
def main():
    global SHARE_Q
    global ACTIVE_Q
    threads = []
    session = loadSession()
    proxies = session.query(Proxy).filter(Proxy.type == "HTTP").order_by(
        Proxy.indate.desc()).limit(20000)

    # Put the fetched proxies into the shared task queue
    for proxy in proxies:
        SHARE_Q.put(proxy)

    # Limit the number of worker threads
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    # Run checkValid() on each item until ACTIVE_Q is empty
    while not ACTIVE_Q.empty():
        item = ACTIVE_Q.get()
        checkValid(item)
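
main() above relies on SHARE_Q, ACTIVE_Q, MyThread, worker and _WORKER_THREAD_NUM, which are defined elsewhere in the module. A minimal sketch of what those pieces could look like, assuming the workers simply drain SHARE_Q and hand every proxy over to ACTIVE_Q for the later checkValid() pass (the bodies below are assumptions, not the project's actual code):

import threading
import Queue

SHARE_Q = Queue.Queue()   # proxies loaded from the database
ACTIVE_Q = Queue.Queue()  # proxies handed on for validation
_WORKER_THREAD_NUM = 10   # assumed thread count


class MyThread(threading.Thread):
    """Thin wrapper that runs a callable in its own thread."""

    def __init__(self, func):
        super(MyThread, self).__init__()
        self.func = func

    def run(self):
        self.func()


def worker():
    # Drain the shared queue; the real project presumably does a quick
    # pre-filter here before pushing the proxy onto ACTIVE_Q.
    while not SHARE_Q.empty():
        try:
            proxy = SHARE_Q.get(block=False)
        except Queue.Empty:
            break
        ACTIVE_Q.put(proxy)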
Example #2
def checkValid(item):
    starttime = datetime.datetime.now()
    rst = checkProxy(proxyIP=item.ip_port, protocol="http", timeout=5)
    costtime = (datetime.datetime.now() - starttime).seconds
    if rst is not None and rst["status"] == "ok":

        proxy = freshProxy(ip_port=item.ip_port,
                           type=item.type,
                           location=rst["rstLocation"].encode("utf-8"),
                           speed=costtime,
                           source=item.source,
                           rule_id=item.rule_id,
                           lastcheck=datetime.datetime.now())

        print rst["rstIP"]
        print rst["rstLocation"].encode("utf-8")
        session = loadSession()
        try:
            session.merge(proxy)
            session.commit()
        except MySQLdb.IntegrityError as e:
            print e.message
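
checkValid() above depends on a checkProxy() helper that is not shown; from the call site it returns None on failure and a dict with "status", "rstIP" and "rstLocation" keys on success. A rough sketch of such a helper, assuming the requests library and the http://httpbin.org/ip echo endpoint (neither is confirmed by the original code):

import requests


def checkProxy(proxyIP, protocol="http", timeout=5):
    # Route a probe request through the proxy; any network error counts
    # as a failed check.
    proxies = {protocol: "%s://%s" % (protocol, proxyIP)}
    try:
        resp = requests.get("http://httpbin.org/ip",
                            proxies=proxies, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    # The real helper also resolves a human-readable location; a
    # placeholder is used here because its lookup source is unknown.
    return {"status": "ok",
            "rstIP": resp.json().get("origin", ""),
            "rstLocation": u"unknown"}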
Example #3
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from main.spiders import config
from main.spiders.model.rules import Rule
from main.spiders.model import loadSession
from main.spiders.proxy_spider import ProxySpiderSpider

settings = Settings()

settings.set("ITEM_PIPELINES", config.ITEM_PIPELINES)
settings.set("DEFAULT_REQUEST_HEADERS", config.DEFAULT_REQUEST_HEADERS)
settings.set("DOWNLOADER_MIDDLEWARES", config.DOWNLOADER_MIDDLEWARES)
settings.set("DOWNLOAD_DELAY", config.DOWNLOAD_DELAY)
settings.set("COOKIES_ENABLED", config.COOKIES_ENABLED)
settings.set("ROBOTSTXT_OBEY", config.ROBOTSTXT_OBEY)

process = CrawlerProcess(settings)

session = loadSession()

# Schedule one crawl per enabled rule, then start the reactor once.
rules = session.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    print rule.id
    process.crawl(ProxySpiderSpider, rule)
process.start()
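
Example #3 reads every Scrapy setting from a main.spiders.config module that is not shown; its attributes are presumably plain module-level constants. A hypothetical config.py illustrating the expected shape (all concrete values below are assumptions):

# main/spiders/config.py -- hypothetical contents
ITEM_PIPELINES = {"main.spiders.pipelines.ProxyPipeline": 300}
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "en",
}
DOWNLOADER_MIDDLEWARES = {
    "main.spiders.middlewares.RandomUserAgentMiddleware": 543,
}
DOWNLOAD_DELAY = 2
COOKIES_ENABLED = False
ROBOTSTXT_OBEY = False

Note that process.crawl() only schedules a spider; the single process.start() call outside the loop starts the Twisted reactor and blocks until every scheduled crawl has finished.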
Example #4
def deleteProxy(item):
    session = loadSession()
    session.query(Proxy).filter(Proxy.ip_port == item.ip_port).delete()
    session.commit()
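
deleteProxy() is the counterpart of checkValid(): it would typically be called once a stored proxy stops responding. A possible cleanup pass, reusing the checkProxy() signature assumed in Example #2 (this sweep is an illustration, not part of the original project):

# Hypothetical sweep: drop HTTP proxies that no longer answer.
session = loadSession()
for item in session.query(Proxy).filter(Proxy.type == "HTTP"):
    if checkProxy(proxyIP=item.ip_port, protocol="http", timeout=5) is None:
        deleteProxy(item)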