class LogDao(object): def __init__(self, logger, belongTo=''): self.connector = Connector() self.belongTo = belongTo self.logger = logger ''' scrapy_log id info 具体内容 level 级别:info warn save_time 时间 belong_to 所属模块 attach 附加信息 ''' def queryAll(self): cursor = self.connector.cursor() if not cursor: return [] sql_query = "select id,info,level,save_time,belong_to,attach from scrapy_log " cursor.execute(sql_query) results = cursor.fetchall() cursor.close() return results or [] def queryPart(self, belong_to): cursor = self.connector.cursor() if not cursor: return [] sql_query = "select id,info,level,save_time,belong_to,attach from scrapy_log where belong_to=%s" cursor.execute(sql_query, (belong_to, )) results = cursor.fetchall() cursor.close() return results or [] def save(self, info, level, belong_to='', attach=''): cursor = self.connector.cursor() if not cursor: return self.logger.info(belong_to + info) cursor = self.connector.cursor() sql_query = 'insert into scrapy_log (info,level,save_time,belong_to,attach) values (%s,%s,%s,%s,%s)' save_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) cursor.execute(sql_query, (info, level, save_time, belong_to, attach)) cursor.close() self.connector.commit() def info(self, info, belong_to='', attach=''): belong_to = belong_to if belong_to else self.belongTo self.save(info, 'info', belong_to, attach) def warn(self, info, belong_to='', attach=''): belong_to = belong_to if belong_to else self.belongTo self.save(info, 'warn', belong_to, attach)
class MysqlPipeline(object): def __init__(self): self.connector = Connector() def process_item(self, item, spider): cursor = self.connector.cursor() if not cursor: spider.logDao.info(u'数据库连接不上暂时不存:' + item['title']) return item if isinstance(item, ContentItem): # 如果存在,则不做处理 spider.logDao.info(u'存微信详情:' + item['title']) sql = "insert into weixin_detail (" \ "content_txt,title,source_url,post_date,sub_channel,post_user,tags,styles," \ "content_html,hash_code,info_type,src_source_id,src_account_id,src_channel,src_ref,wx_account,update_time) " \ "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) " update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) try: cursor.execute(sql, ( item['content_txt'], item['title'], item['source_url'], item['post_date'], item['sub_channel'], item['post_user'], item['tags'], item['styles'], item['content_html'], item['hash_code'], item['info_type'], item['src_source_id'], item['src_account_id'], item['src_channel'], item['src_ref'], item['wx_account'], update_time)) spider.logDao.info(u'存微信详情:' + item['title'] + u' 成功' + u' ' + item['post_date']) except Exception as e: spider.logDao.warn(str(e)) spider.logDao.warn(u'存微信详情:' + item['title'] + u' 失败') else: pass cursor.close() self.connector.commit() return item def close_spider(self, spider): self.connector.commit()
class CheckDao(object): def __init__(self): self.connector = Connector() self.hashList = [] # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取 def resetHashList(self): # 每次重新抓取的时候清除 self.hashList = [] def checkExist(self, source_url): """ 存在逻辑判断 :return: """ hash_code = self.getHashCode(source_url) cursor = self.connector.cursor() if not cursor: return True sql_query = 'select id from tengxun_detail where hash_code=%s' cursor.execute(sql_query, (hash_code, )) results = cursor.fetchall() cursor.close() if results or self.isInHashList(hash_code): return True else: return False def isInHashList(self, hash_code): return hash_code in self.hashList def getHashCode(self, source_url): # 具体逻辑 return EncryptUtil.md5(source_url) def getHtml(self, pageIndex): """ 获取所有html逻辑 :return: """ cursor = self.connector.cursor() if not cursor: return [] sql_query = 'select id,content_html from tengxun_detail group by id limit %s, %s' cursor.execute(sql_query, ((pageIndex - 1) * 15, 15)) results = cursor.fetchall() cursor.close() return results or [] def getPostTime(self, pageIndex): """ 时间逻辑 :return: """ cursor = self.connector.cursor() if not cursor: return [] sql_query = 'select id,post_date from tengxun_detail group by id limit %s, %s' cursor.execute(sql_query, ((pageIndex - 1) * 15, 15)) results = cursor.fetchall() cursor.close() return results or [] def updatePostTime(self, id, post_date): cursor = self.connector.cursor() if not cursor: return sql_query = "update tengxun_detail set post_date=%s where id=%s" cursor.execute(sql_query, (post_date, id)) cursor.close() self.connector.commit() def updateStyles(self, id, styles): cursor = self.connector.cursor() if not cursor: return sql_query = "update tengxun_detail set styles=%s where id=%s" cursor.execute(sql_query, (styles, id)) cursor.close() self.connector.commit() def updateHtml(self, id, content_html): cursor = self.connector.cursor() if not cursor: return sql_query = "update tengxun_detail set content_html=%s where id=%s" cursor.execute(sql_query, (content_html, id)) cursor.close() self.connector.commit() # checkDao = CheckDao() # pageIndex = 1 # while True: # results = checkDao.getPostTime(pageIndex) # if not len(results): # print 'end' # break # print 'pageIndex', pageIndex # for result in results: # id, post_date = result # print 'in: ', id, post_date # # post_date = post_date.replace(u'\xa0', u' ') # # 处理标签 # try: # # post_date = time.strptime(post_date, "%Y-%m-%d %H:%M:%S") # time.strftime("%Y-%m-%d %H:%M:%S", ) # timeArray = time.strptime(str(post_date), "%Y-%m-%d %H:%M:%S") # update_time_long = int(time.mktime(timeArray)) # except Exception as e: # print u'出错..........................................................................' # pass # print 'out: ', id, update_time_long # x = time.localtime(update_time_long) # print 'out: ', id, time.strftime('%Y-%m-%d %H:%M:%S', x) # # checkDao.updatePostTime(id, post_date) # pageIndex += 1
class CheckDao(object): def __init__(self): self.connector = Connector() self.hashList = [] # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取 def resetHashList(self): # 每次重新抓取的时候清除 self.hashList = [] def checkExist(self, source_url): """ 存在逻辑判断 """ hash_code = self.getHashCode(source_url) cursor = self.connector.cursor() if not cursor: return True sql_query = 'select id from sina_detail where hash_code=%s' cursor.execute(sql_query, (hash_code, )) results = cursor.fetchall() cursor.close() if results or self.isInHashList(hash_code): return True else: self.hashList.append(hash_code) return False def isInHashList(self, hash_code): return hash_code in self.hashList def getHashCode(self, source_url): # 具体逻辑 return EncryptUtil.md5(source_url) def getAllHtml(self): """ 存在逻辑判断 """ cursor = self.connector.cursor() if not cursor: return [] sql_query = 'select id,styles,content_html from sina_detail' cursor.execute(sql_query) results = cursor.fetchall() cursor.close() return results or [] def update(self, id, styles, content_html): cursor = self.connector.cursor() if not cursor: return sql_query = "update sina_detail set content_html=%s,styles=%s where id=%s" cursor.execute(sql_query, (content_html, styles, id)) cursor.close() self.connector.commit() def getHtml(self, pageIndex): """ 存在逻辑判断 """ cursor = self.connector.cursor() if not cursor: return [] sql_query = 'select id,content_html from sina_detail group by id limit %s, %s' cursor.execute(sql_query, ((pageIndex - 1) * 15, 15)) results = cursor.fetchall() cursor.close() return results or [] def updateStyles(self, id, styles): cursor = self.connector.cursor() if not cursor: return sql_query = "update sina_detail set styles=%s where id=%s" cursor.execute(sql_query, (styles, id)) cursor.close() self.connector.commit() def updateHtml(self, id, content_html): cursor = self.connector.cursor() if not cursor: return sql_query = "update sina_detail set content_html=%s where id=%s" cursor.execute(sql_query, (content_html, id)) cursor.close() self.connector.commit()
class DataMonitorDao(object): def __init__(self): self.connector = Connector() def getAllHeartBeatTime(self, cursor_out=None): """ :param cursor_out: :param types: sina_heartbeat tengxun_heartbeat wangyi_heartbeat weixin_heartbeat weixin_source_heartbeat :return: 得到心跳更新时间 """ if not cursor_out: cursor = self.connector.cursor() if not cursor: return else: cursor = cursor_out typeStr = " where type in ('sina_heartbeat','tengxun_heartbeat','wangyi_heartbeat','weixin_heartbeat','weixin_source_heartbeat')" sql_query = "select type, update_time from data_monitor " + typeStr try: cursor.execute(sql_query) except Exception as e: print e.msg results = cursor.fetchall() if not cursor_out: cursor.close() self.connector.close() return results or [] def heartBeat(self, type=''): """ :param type: sina_heartbeat tengxun_heartbeat wangyi_heartbeat weixin_heartbeat weixin_source_heartbeat :return: 更新跳动时间 """ if not type: return cursor = self.connector.cursor() if not cursor: return info = u'跳动中' remark = '' if type == 'sina_heartbeat': remark = u'新浪_心跳1分钟更新一次' elif type == 'tengxun_heartbeat': remark = u'腾讯_心跳1分钟更新一次' elif type == 'wangyi_heartbeat': remark = u'网易_心跳1分钟更新一次' elif type == 'weixin_heartbeat': remark = u'微信_心跳1分钟更新一次' elif type == 'weixin_source_heartbeat': remark = u'微信源_心跳1分钟更新一次' # 检测是否存在type , update_time , info , remark, account if self.checkExist(cursor_out=cursor, type=type): update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) sql_query = "update data_monitor set update_time=%s,info=%s,remark=%s where type=%s" values = (update_time, info, remark, type) else: sql_query = "INSERT INTO data_monitor ( type, update_time, info, remark) VALUES (" \ "%s, %s, %s, %s); " update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) values = (type, update_time, info, remark) print u'跳一下', update_time try: cursor.execute(sql_query, values) except Exception as e: print e.msg cursor.close() self.connector.commit() self.connector.close() def updateTotal(self, type, account=''): """ :param account: :param type: sina_total tengxun_total wangyi_total weixin_total weixin_source_total weixin_account_total :return: 更新总数 """ if not type: return cursor = self.connector.cursor() if not cursor: return count = self.getTotal(cursor_out=cursor, type=type, account=account) info = u'总数:' + str(count) remark = '' if type == 'sina_total': remark = u'新浪条数' elif type == 'tengxun_total': remark = u'腾讯条数' elif type == 'wangyi_total': remark = u'网易条数' elif type == 'weixin_total': remark = u'微信条数' elif type == 'weixin_source_total': remark = u'微信源条数' elif type == 'weixin_account_total': remark = u'微信源_对应账号条数' # 检测是否存在type , update_time , info , remark, account if self.checkExist(cursor_out=cursor, type=type, account=account): update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) sql_query = "update data_monitor set update_time=%s,info=%s,remark=%s where type=%s and account=%s" values = (update_time, info, remark, type, account) else: sql_query = "INSERT INTO data_monitor ( type, update_time, info, remark, account) VALUES (" \ "%s, %s, %s, %s, %s); " update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) values = (type, update_time, info, remark, account) try: cursor.execute(sql_query, values) except Exception as e: print e.msg cursor.close() self.connector.commit() def checkExist(self, cursor_out=None, type='', account=''): """ :param cursor_out: :param account: :param type: sina_total tengxun_total wangyi_total weixin_total weixin_source_total weixin_account_total :return: 得到总数 """ if not cursor_out: cursor = self.connector.cursor() if not cursor: return else: cursor = cursor_out accountWhere = '' if account: accountWhere = " and account='%s'" % account sql_query = "select * from data_monitor where type=%s" + accountWhere try: cursor.execute(sql_query, (type, )) except Exception as e: print e.msg results = cursor.fetchall() if not cursor_out: cursor.close() if results: return True else: return False def getTotal(self, cursor_out=None, type='', account=''): """ :param cursor_out: :param account: :param type: sina_total tengxun_total wangyi_total weixin_total weixin_source_total weixin_account_total :return: 得到总数 """ if not cursor_out: cursor = self.connector.cursor() if not cursor: return else: cursor = cursor_out table = '' accountWhere = '' if type == 'sina_total': table = 'sina_detail' elif type == 'tengxun_total': table = 'tengxun_detail' elif type == 'wangyi_total': table = 'wangyi_detail' elif type == 'weixin_total': table = 'weixin_detail' elif type == 'weixin_source_total': table = 'weixin_source' elif type == 'weixin_account_total': table = 'weixin_detail' if not account: return 0 accountWhere = " where wx_account='%s'" % account if not table: return 0 sql_query = "select count(*) from " + table + accountWhere try: cursor.execute(sql_query) except Exception as e: print e.msg results = cursor.fetchone() if not cursor_out: cursor.close() if results: (count, ) = results return count else: return 0
class WxSourceDao(object): def __init__(self): self.connector = Connector() self.orderType = 'desc' # 用于判断是升序还是降序 def queryEnable(self, isRandom=False): cursor = self.connector.cursor() if not cursor: return [] if self.orderType == "desc": self.orderType = "asc" else: self.orderType = "desc" # """ # select wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source # where is_enable='1' # and # ( # (update_status='last' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(update_time))/60)>20) # or # (update_status='updating' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(update_time))/60)>20) # or update_status='updateFail' # or update_status='none' # ) # order by id # """ # 可用的 且( 更新状态为last且时间大于20分钟/ 更新状态为updating且时间大于20分钟/更新状态为updating/更新状态为none) sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \ "where is_enable='1' and ((update_status='last' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(" \ "update_time))/60)>40) or (update_status='updating' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(" \ "update_time))/60)>40) or update_status='updateFail' or update_status='none') order by id "+self.orderType cursor.execute(sql_query) results = cursor.fetchall() cursor.close() results = results or [] # print "sources长度", len(results) if isRandom and results: # 随机排序 防止出现都是请求同一个 random.shuffle(results) return results def queryWxUrl_special(self, isRandom=False, wx_accounts=[]): """ 获取wxUrl有值,且是有效的 :return: """ cursor = self.connector.cursor() if not cursor: return [] if self.orderType == "desc": self.orderType = "asc" else: self.orderType = "desc" accountStr = '' if len(wx_accounts): accountStr = ','.join(wx_accounts) accountStr = "and wx_account in ('%s')"%accountStr sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \ "where is_enable='1' and wx_url !='' "+accountStr+" order by id " + self.orderType cursor.execute(sql_query) results = cursor.fetchall() cursor.close() results = results or [] if isRandom and results: # 随机排序 防止出现都是请求同一个 random.shuffle(results) return results def queryWxUrl(self, isRandom=False): """ 获取wxUrl有值,且是有效的 :return: """ cursor = self.connector.cursor() if not cursor: return [] if self.orderType == "desc": self.orderType = "asc" else: self.orderType = "desc" sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \ "where is_enable='1' and wx_url !='' order by id " + self.orderType cursor.execute(sql_query) results = cursor.fetchall() cursor.close() results = results or [] if isRandom and results: # 随机排序 防止出现都是请求同一个 random.shuffle(results) return results def updateStatus(self, wx_account, update_status): cursor = self.connector.cursor() if not cursor: return sql_query = "update weixin_source set update_status=%s,update_time=%s where wx_account=%s" update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) cursor.execute(sql_query, (update_status, update_time, wx_account)) cursor.close() self.connector.commit() def resetUpdating(self): """ 重置更新中的的为updateFail,如果出现网络问题,似乎无法回调到而是会一直retry,所以先尝试手动在外部重置 """ cursor = self.connector.cursor() if not cursor: return sql_query = "update weixin_source set update_status='updateFail',update_time=%s where update_status='updating'" update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) cursor.execute(sql_query, (update_time,)) cursor.close() self.connector.commit() def updateSource(self, wx_account, wx_name, wx_url, update_status): cursor = self.connector.cursor() if not cursor: return sql_query = "update weixin_source set wx_name=%s,wx_url=%s,update_status=%s,update_time=%s where " \ "wx_account=%s " update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) cursor.execute(sql_query, (wx_name, wx_url, update_status, update_time, wx_account)) cursor.close() self.connector.commit()
class CheckDao(object): def __init__(self): self.connector = Connector() self.hashList = [] # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取 def resetHashList(self): # 每次重新抓取的时候清除 self.hashList = [] def checkExist(self, source_url): """ 存在逻辑判断 """ hash_code = self.getHashCode(source_url) cursor = self.connector.cursor() if not cursor: return True sql_query = 'select id from wangyi_detail where hash_code=%s' cursor.execute(sql_query, (hash_code,)) results = cursor.fetchall() cursor.close() if results or self.isInHashList(hash_code): return True else: self.hashList.append(hash_code) return False def isInHashList(self, hash_code): return hash_code in self.hashList def getHashCode(self, source_url): # 具体逻辑 return EncryptUtil.md5(source_url) def getHtml(self, pageIndex): """ 存在逻辑判断 """ cursor = self.connector.cursor() if not cursor: return [] sql_query = 'select id,content_html from wangyi_detail group by id limit %s, %s' cursor.execute(sql_query, ((pageIndex-1)*15, 15)) results = cursor.fetchall() cursor.close() return results or [] def updateStyles(self, id, styles): cursor = self.connector.cursor() if not cursor: return sql_query = "update wangyi_detail set styles=%s where id=%s" cursor.execute(sql_query, (styles, id)) cursor.close() self.connector.commit() def updateHtml(self, id, content_html): cursor = self.connector.cursor() if not cursor: return sql_query = "update wangyi_detail set content_html=%s where id=%s" cursor.execute(sql_query, (content_html, id)) cursor.close() self.connector.commit() # 去除alt和title内容 # checkDao = CheckDao() # pageIndex = 1 # while True: # results = checkDao.getHtml(pageIndex) # if not len(results): # print 'end' # break # print 'pageIndex', pageIndex # for result in results: # id, content_html = result # # 处理标签 # print id # selector = Selector(text=content_html) # imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract() # # 处理提示块img的 alt title, 关注//img/@alt|//img/@title # print len(imgAltTitles) # for imgAltTitle in imgAltTitles: # if imgAltTitle.strip(' '): # print 'here', imgAltTitle # content_html = content_html.replace(imgAltTitle, '') # # 更新 # checkDao.updateHtml(id, content_html) # pageIndex += 1