def delete_proxy(ip, port):
    """Delete a proxy from the database.

    :param ip: ip
    :param port: port
    """
    if ip != "" and port != "":
        db = DBSession()
        db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
        try:
            db.commit()
            return True
        except exc.SQLAlchemyError as e:
            logging.info("Delete Proxy Error: %s", e)
            return False
def process_item(self, item, spider):
    # Earlier MySQLdb-based version, kept commented out:
    # try:
    #     con = MySQLdb.connect(**config.db_config)
    #     cur = con.cursor()
    #     sql = "INSERT INTO leiju_proxy (ip,port,proto,checked_at,created_at) VALUES (%s,%s,%s,%s,%s)"
    #     param = [(item['ip'], item['port'], 'http', int(time.time()), int(time.time()))]
    #     cur.executemany(sql, param)
    #     con.commit()
    #     con.close()
    #     return item
    # except Exception as e:
    #     logging.info("SaveError: %s:%s %s" % (item['ip'], item['port'], e))
    #     raise DropItem("SaveError: %s:%s %s" % (item['ip'], item['port'], e))

    db = DBSession()
    md5 = hashlib.md5()
    md5.update(item['ip'] + "." + item['port'])
    haship = md5.hexdigest()
    proxy = Proxy(haship=haship, ip=item['ip'], port=item['port'],
                  create_time=int(time.time()))
    db.add(proxy)
    try:
        db.commit()
    except exc.SQLAlchemyError as e:
        raise DropItem("SaveError: %s:%s %s" % (item['ip'], item['port'], e))
    return item
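# Note: the pipeline above only takes effect once it is registered in the crawler
# settings, mirroring the run script further below. A commented sketch; the class
# name 'pipelines.SaveProxyPipline' is a hypothetical placeholder, since the actual
# pipeline class name is not shown in this excerpt.
# settings.set("ITEM_PIPELINES", {
#     'pipelines.SaveProxyPipline': 200,  # hypothetical: save scraped proxies to the database
# })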
def process_item(self, item, spider):
    db = DBSession()
    redis = confRedis
    rule_id = spider.rule_id
    url = item['url']
    md5 = hashlib.md5()
    md5.update(url)
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    # site_name = spider.rule['allow_domains']
    html_title = item['html_title']
    # html_body = item['html_body']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    title = item['title'] if 'title' in item else ""
    body = item['body'] if 'body' in item else ""
    thumb = item['thumb'] if 'thumb' in item else ""
    img_list = item['img_list'] if 'img_list' in item else ""
    # TODO: analyse the scraped data for its publish time and convert it to a timestamp.
    publish_time = utils.smart2date(item['publish_time']) if 'publish_time' in item else ""
    source_site = item['source_site'] if 'source_site' in item else ""
    flag = default_page_flag

    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5, site_name=site_name,
                html_title=html_title, save_path=save_path, save_time=save_time,
                title=title, thumb=thumb, img_list=img_list, body=body,
                publish_time=publish_time, source_site=source_site, flag=flag)
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        # A page with this urlmd5 already exists: re-create the Page without the
        # urlmd5 field (presumably to avoid duplicating the unique hash column).
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name,
                    html_title=html_title, save_path=save_path, save_time=save_time,
                    title=title, thumb=thumb, img_list=img_list, body=body,
                    publish_time=publish_time, source_site=source_site, flag=flag)
    db.add(page)
    try:
        db.commit()
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError as e:
        raise DropItem("SaveDbError: %s,%s" % (url, e))
    return item
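# The redis flag 'url:%s' written above presumably marks a URL as already saved.
# A minimal sketch of a dedupe check built on that flag; this helper is an
# illustration only, the excerpt does not show where the flag is consumed.
def is_url_saved(url):
    """Return True if the pipeline has already stored this URL (hypothetical helper)."""
    return confRedis.get('url:%s' % url) is not None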
def get_proxy_list():
    """Get the list of proxies."""
    db = DBSession()
    proxy_list = db.query(Proxy).all()
    db.close()
    return proxy_list
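# A minimal usage sketch (illustration only, not part of the original module):
# pick one proxy at random from the list, e.g. to build a proxy URL for a request.
def pick_random_proxy():
    import random
    proxy_list = get_proxy_list()
    if not proxy_list:
        return None
    p = random.choice(proxy_list)
    return 'http://%s:%s' % (p.ip, p.port)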
# python
# -*- coding: utf-8 -*-
# from sqlalchemy import exc
from conf.config import DBSession
from util import WebProxy as webProxy
from model.Proxy import Proxy

# Re-check every stored proxy and delete the ones that no longer pass the check.
db = DBSession()
ips = db.query(Proxy).all()
for item in ips:
    ret = webProxy.check_proxy(item.ip, item.port)
    if not ret:
        webProxy.delete_proxy(item.ip, item.port)
    # save_path=TEXT(stored=True)
)

if not os.path.exists(search_path):
    os.mkdir(search_path)
ix = create_in(search_path, schema)
writer = ix.writer()

# fromtimestamp: int -> time
# time = 0
# datetime.datetime.fromtimestamp(time)
# strptime: string -> time
# format = '%Y-%m-%d %H:%M:%S'
str = '2015-10-27 15:09:38'
# datetime.datetime.strptime(str, format)
timeformat = '%Y-%m-%d %H:%M:%S'

db = DBSession()
"""
For the filter conditions, refer to the incremental-index approach at the bottom of this file.
"""
# Insert data
# sql = text('insert into users (u_name, u_password) values (:name, :password)')
# data = db.execute(sql, {'name': 'nate1', 'password': password})
# Delete data
# sql = text('delete from users where u_id = :id')
# data = session.execute(sql, {'id': last_id})
# sql = text('select * from users')
# data = session.execute(sql)

# Query: raw SQL version
sql = text('select * from Page where id < :id')
pageList = db.execute(sql, {'id': 20})
# Query: ORM version
# pageList = db.query(Page).filter(Page.id < 20).all()
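# A minimal sketch of feeding the query results into the Whoosh writer created
# above. The field names (title, body, save_path) are assumptions inferred from
# the commented-out schema line, not confirmed by this excerpt.
for row in pageList:
    writer.add_document(title=unicode(row.title),           # assumed schema field
                        body=unicode(row.body),             # assumed schema field
                        save_path=unicode(row.save_path))   # assumed schema field
writer.commit()  # persist the added documents to the index in search_path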
# python
# -*- coding: utf-8 -*-
from conf.config import DBSession, log_format, log_file, log_path, log_open, img_save_path
from model.Rule import Rule
from spiders.RuleSpider import RuleSpider
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
import logging
from datetime import datetime
import os

db = DBSession()


def run_spider():
    settings = Settings()
    settings.set("COOKIES_ENABLED", False)  # disable cookie tracking
    settings.set(
        "ITEM_PIPELINES", {
            'pipelines.ImgPipline': 150,            # save images locally
            # 'pipelines.CoverImagesPipeline': 150,  # save images to Qiniu cloud
            'pipelines.SaveCommonPipline': 200,      # save to the database
            # 'pipelines.FilterUrlPipline': 300,
        })
    settings.set(
        "DOWNLOADER_MIDDLEWARES", {