Example #1
def delete_proxy(ip, port):
    """ Delete a proxy from the database.
    :param ip: IP address
    :param port: port
    """
    if ip != "" and port != "":
        db = DBSession()
        db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
        try:
            db.commit()
            return True
        except exc.SQLAlchemyError as e:
            db.rollback()  # keep the session usable after a failed commit
            logging.info("Delete Proxy Error: %s", e)
            return False
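Note: every example below obtains its session from conf.config.DBSession, which is not part of this listing. A minimal sketch of how such a factory is typically defined with SQLAlchemy, assuming a sessionmaker bound to a single engine (the connection URL is a placeholder, not the project's real one):

# conf/config.py (hypothetical sketch)
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# The real connection URL lives in the project's config; this is a placeholder.
engine = create_engine('mysql+pymysql://user:password@localhost/spider', echo=False)
DBSession = sessionmaker(bind=engine)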
Example #2
def process_item(self, item, spider):
    # Legacy raw-MySQLdb version, kept commented out for reference:
    # try:
    #     con = MySQLdb.connect(**config.db_config)
    #     cur = con.cursor()
    #     sql = "INSERT INTO leiju_proxy (ip,port,proto,checked_at,created_at) VALUES (%s,%s,%s,%s,%s)"
    #     param = [(item['ip'], item['port'], 'http', int(time.time()), int(time.time()))]
    #     cur.executemany(sql, param)
    #     con.commit()
    #     con.close()
    #     # return item
    # except Exception as e:
    #     logging.info("SaveError: %s:%s %s" % (item['ip'], item['port'], e))
    #     raise DropItem("SaveError: %s:%s %s" % (item['ip'], item['port'], e))
    db = DBSession()
    md5 = hashlib.md5()
    # hashlib needs bytes, so encode the "ip.port" key before hashing
    md5.update((item['ip'] + "." + item['port']).encode('utf-8'))
    haship = md5.hexdigest()
    proxy = Proxy(haship=haship,
                  ip=item['ip'],
                  port=item['port'],
                  create_time=int(time.time()))
    db.add(proxy)
    try:
        db.commit()
    except exc.SQLAlchemyError as e:
        db.rollback()  # keep the session usable after a failed commit
        raise DropItem("SaveError: %s:%s %s" %
                       (item['ip'], item['port'], e))
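The Proxy model referenced here is not shown in the listing. A minimal sketch of what it presumably looks like, given the fields used above (the table name and column sizes are assumptions):

# model/Proxy.py (hypothetical sketch matching the fields used above)
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Proxy(Base):
    __tablename__ = 'proxy'  # assumed table name
    id = Column(Integer, primary_key=True)
    haship = Column(String(32), unique=True)  # md5 of "ip.port", presumably unique
    ip = Column(String(15))
    port = Column(String(5))
    create_time = Column(Integer)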
Example #3
def process_item(self, item, spider):
    db = DBSession()
    redis = confRedis

    rule_id = spider.rule_id
    url = item['url']
    md5 = hashlib.md5()
    md5.update(url.encode('utf-8'))  # hashlib needs bytes
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    # site_name = spider.rule['allow_domains']
    html_title = item['html_title']
    # html_body = item['html_body']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    title = item.get('title', "")
    body = item.get('body', "")
    thumb = item.get('thumb', "")
    img_list = item.get('img_list', "")

    # TODO: use a parsing step here that extracts the publish time of the
    # scraped data and converts it to a timestamp
    publish_time = utils.smart2date(item['publish_time']) if 'publish_time' in item else ""
    source_site = item.get('source_site', "")
    flag = default_page_flag

    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5, site_name=site_name, html_title=html_title,
                save_path=save_path,
                save_time=save_time, title=title,
                thumb=thumb, img_list=img_list,
                body=body, publish_time=publish_time,
                source_site=source_site, flag=flag)
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        # The URL was seen before: rebuild the row without urlmd5,
        # presumably to avoid violating a unique index on that column.
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name, html_title=html_title,
                    save_path=save_path,
                    save_time=save_time, title=title,
                    thumb=thumb, img_list=img_list,
                    body=body, publish_time=publish_time,
                    source_site=source_site, flag=flag)

    db.add(page)
    try:
        db.commit()
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError as e:
        db.rollback()  # keep the session usable after a failed commit
        raise DropItem("SaveDbError: %s,%s" % (url, e))
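utils.md5dir and utils.save_file are project helpers that are not shown. A hypothetical sketch of md5dir, under the assumption that it derives a nested on-disk path from the URL's md5 (the exact layout is a guess):

# hypothetical sketch of utils.md5dir; the real implementation is not in this listing
import hashlib

def md5dir(url):
    """Map a URL to a nested save path such as 'ab/cd/abcdef....html'."""
    h = hashlib.md5(url.encode('utf-8')).hexdigest()
    return '%s/%s/%s.html' % (h[:2], h[2:4], h)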
Example #4
def get_proxy_list():
    """ 获取代理列表"""
    db = DBSession()
    proxy_list = db.query(Proxy).all()
    db.close()
    return proxy_list
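Because the session is closed before returning, the Proxy rows come back detached, but their already-loaded columns remain readable. A short usage sketch (picking a random proxy is an assumption, not project code):

# usage sketch: choose a random proxy for an outgoing request
import random

proxy_list = get_proxy_list()
if proxy_list:
    p = random.choice(proxy_list)
    proxy_url = 'http://%s:%s' % (p.ip, p.port)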
Example #5
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from sqlalchemy import exc
from conf.config import DBSession
from util import WebProxy as webProxy
from model.Proxy import Proxy

db = DBSession()
ips = db.query(Proxy).all()

# Re-check every stored proxy and delete the ones that no longer respond.
for item in ips:
    ret = webProxy.check_proxy(item.ip, item.port)
    if not ret:
        webProxy.delete_proxy(item.ip, item.port)
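WebProxy.check_proxy is not shown in this listing. A minimal sketch of what it presumably does, assuming it issues a test request through the proxy (the probe URL and timeout are assumptions):

# hypothetical sketch of util.WebProxy.check_proxy
import requests

def check_proxy(ip, port):
    proxies = {'http': 'http://%s:%s' % (ip, port)}
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        return r.status_code == 200
    except requests.RequestException:
        return False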

Example #6
    # save_path=TEXT(stored=True)
)
if not os.path.exists(search_path):
    os.mkdir(search_path)

ix = create_in(search_path, schema)
writer = ix.writer()
# fromtimestamp: int -> datetime
#     datetime.datetime.fromtimestamp(0)
# strptime: string -> datetime
#     datetime.datetime.strptime('2015-10-27 15:09:38', '%Y-%m-%d %H:%M:%S')

timeformat = '%Y-%m-%d %H:%M:%S'
db = DBSession()
""" 过滤条件参照该文件最下面的增量索引的方案 """
# 新增数据
# sql = text('insert into users (u_name, u_password) values (:name, :password)')
# data = db.execute(sql, {'name': 'nate1', 'password': password})
# 删除数据
# sql = text('delete from users where u_id = :id')
#     data = session.execute(sql, {'id': last_id})
# sql = text('select * from users')
#    data = session.execute(sql)

# Query: raw-SQL version
sql = text('select * from Page where id < :id')
pageList = db.execute(sql, {'id': 20})
# Query: ORM version
# pageList = db.query(Page).filter(Page.id < 20).all()
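The snippet stops after fetching pageList. For the index to become searchable, the rows still have to be added to the writer and committed. A hedged sketch of that step (the field names mirror the Page columns used elsewhere in this listing and the truncated schema above, so treat them as assumptions):

# feed the fetched pages into the Whoosh index (field names are assumptions)
import datetime

for row in pageList:
    writer.add_document(
        title=row.title,
        body=row.body,
        save_time=datetime.datetime.fromtimestamp(row.save_time),
        # save_path=row.save_path,
    )
writer.commit()  # a Whoosh writer must be committed before documents are searchable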
Example #7
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from conf.config import DBSession, log_format, log_file, log_path, log_open, img_save_path
from model.Rule import Rule
from spiders.RuleSpider import RuleSpider

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
import logging
from datetime import datetime
import os

db = DBSession()


def run_spider():
    settings = Settings()
    settings.set("COOKIES_ENABLES", False)  # 禁止cookies追踪
    settings.set(
        "ITEM_PIPELINES",
        {
            'pipelines.ImgPipline': 150,  # save images locally
            # 'pipelines.CoverImagesPipeline': 150,  # save images to Qiniu cloud storage
            'pipelines.SaveCommonPipline': 200,  # save to the database
            # 'pipelines.FilterUrlPipline': 300,
        })

    settings.set(
        "DOWNLOADER_MIDDLEWARES",
        {
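The listing is cut off inside the DOWNLOADER_MIDDLEWARES dict. A hedged sketch of how a run_spider like this typically finishes, assuming RuleSpider accepts a rule argument and one crawl job is queued per stored Rule (the elided middleware entries are left out, not reconstructed):

            # ... middleware entries elided in the original listing ...
        })

    configure_logging(install_root_handler=False)
    process = CrawlerProcess(settings)
    for rule in db.query(Rule).all():  # assumed: one crawl job per stored Rule
        process.crawl(RuleSpider, rule=rule)
    process.start()


if __name__ == '__main__':
    run_spider()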