def parse(self, response):
    """Parse a list page."""
    last_md5 = ''
    if self.isFirstListPage:
        checkText = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(checkText)
    logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))
    if (not self.is_duplicate) and OPEN_MD5_CHECK and self.isFirstListPage and last_md5 == self.last_md5:
        # The check text is unchanged since the last crawl: stop here.
        # (A bare return replaces the original `yield []`, which Scrapy
        # rejects as a callback result.)
        return
    for request in self.getDetailPageUrls(response):
        yield request
    # Get the next list-page URL
    if not self.isDone:
        for request in self.getNextListPageUrl(response):
            yield request
    # Sync the md5 digest & last_id
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
        self.isFirstListPage = False
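The `toMd5` helper used throughout is imported from `mySpiders.utils.hash` (see the test script at the end of this section), but its body never appears here. A minimal sketch, assuming it must accept str, unicode, and arbitrary objects (it is also called as `toMd5(d.entries)` below):

# Assumed implementation of mySpiders.utils.hash.toMd5 -- not the project's
# actual code, just consistent with every call site in this section.
import hashlib

def toMd5(data):
    if isinstance(data, unicode):   # Python 2: encode unicode before hashing
        data = data.encode('utf-8')
    elif not isinstance(data, str):
        data = str(data)            # e.g. toMd5(d.entries) hashes the list's repr
    return hashlib.md5(data).hexdigest()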
def process_item(self, item, spider):
    if not item:
        logging.info('--------item is empty : %s' % item)
        return True
    rule_id = item['rule_id']
    public_time = int(time.time())
    create_time = int(time.time())
    img_url = json.dumps(item['img_url'])
    description = item['description']
    if not description:
        return True
    # Truncate the title to 255 characters (not bytes) before storing.
    title = item['title'].decode('utf8')[0:255].encode('utf8')
    insertData = {
        'source_url': item['source_url'],
        'unique_code': toMd5(item['source_url']),
        'rule_id': rule_id,
        'title': title,
        'description': description,
        'img_url': img_url,
        'public_time': public_time,
        'create_time': create_time
    }
    self.db.insert(self.tableName, insertData)
    return True
def parse_node(self, response):
    self.currentNode = response
    # logging.info("*********meta******%s****************" % response.meta['spiderConfig'])
    self.initConfig(response.meta['spiderConfig'])
    checkText = self.safeParse(self.checkTxtXpath)
    last_md5 = toMd5(checkText)
    if last_md5 == response.meta['spiderConfig'].get('last_md5', ''):
        # Feed unchanged since the last crawl; emit nothing.
        # (A bare return replaces the original `yield []`.)
        return
    item = XmlFeedItem()
    item['title'] = [t.encode('utf-8') for t in self.safeParse(self.titleXpath)]
    imageAndDescriptionInfos = self.parseDescriptionAndImages()
    item['img_url'] = imageAndDescriptionInfos['img_url']
    item['description'] = imageAndDescriptionInfos['description']
    item['public_time'] = [p.encode('utf-8') for p in self.safeParse(self.pubDateXpath)]
    item['source_url'] = [g.encode('utf-8') for g in self.safeParse(self.guidXpath)]
    item['rule_id'] = self.rule_id
    yield item
    # Update the md5 in MySQL and fetch the next rule to crawl.
    spiderConfig = getCrawlNoRssRequest({'last_md5': last_md5, 'id': self.rule_id})
    if spiderConfig:
        yield Request(spiderConfig.get('start_urls', '')[0],
                      headers={'Referer': 'http://www.google.com'},
                      meta={'spiderConfig': spiderConfig},
                      callback=self.parse_node,
                      dont_filter=True)
def process_item(self, item, spider):
    if not item:
        logging.info('--------item is empty : %s' % item)
        return True
    create_time = int(time.time())
    img_url = json.dumps(item['img_url'])
    if (not item['description']) and (not item['content']):
        return True
    title = item['title'].decode('utf8')[0:255].encode('utf8')
    insertData = {
        'source_url': item['source_url'],
        'unique_code': toMd5(item['source_url']),
        'rule_id': item['rule_id'],
        'title': title,
        'description': item['description'],
        'content': item['content'],
        'img_url': img_url,
        'source_score': item['source_score'],
        'is_sync': '0',
        'public_time': item['public_time'],
        'create_time': create_time
    }
    insertOk = self.db.insert(self.tableName, insertData)
    # When the insert hits an existing unique_code and the spider allows
    # re-crawling, fall back to updating the existing row.
    if (not insertOk) and spider.is_duplicate:
        self.db.update(self.tableName, insertData,
                       "unique_code = '" + insertData['unique_code'] + "'")
        logging.info('========update.unique_code : %s' % insertData['unique_code'])
    return True
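The WHERE clause above is built by string concatenation; `unique_code` is always an md5 hex digest, so no quoting issue can arise here, but with a raw DB-API cursor the parameterized form would look like this (a sketch: the cursor and table name are illustrative, not the project's `self.db` wrapper, and the column list is abridged):

# Parameterized update (sketch; 'crawl_infos' is a placeholder table name)
cursor.execute(
    "UPDATE crawl_infos SET title = %s, content = %s WHERE unique_code = %s",
    (insertData['title'], insertData['content'], insertData['unique_code']))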
def parse(self, response):
    """Parse a list page."""
    last_md5 = ''
    if self.isFirstListPage:
        checkText = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(checkText)
    logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))
    if self.isFirstListPage and last_md5 == self.last_md5:
        # Unchanged since the last crawl (a bare return replaces `yield []`).
        return
    for request in self.getDetailPageUrls(response):
        yield request
    # Get the next list-page URL
    if not self.isDone:
        for request in self.getNextListPageUrl(response):
            yield request
    # Sync the md5 digest & last_id
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
        self.isFirstListPage = False
def parse(self, data):
    RssItemList = {}
    # CollectionHelper.printEx(data)
    for i in data.entries:
        RssItem = {}
        RssItem['source_url'] = i.get('link', '')
        if not RssItem['source_url']:
            continue
        RssItem['unique_code'] = toMd5(RssItem['source_url'])
        RssItem['rule_id'] = self.rule_id
        RssItem['title'] = i.get('title', '')
        text = self.parse_content(i)
        tmpInfos = self.parseContentAndImg(text)
        RssItem['content'] = tmpInfos['content']
        RssItem['img_url'] = json.dumps(tmpInfos['img_url']) if tmpInfos['img_url'] else ""
        RssItem['description'] = self.parseDescription(i)
        RssItem['public_time'] = self.parse_public_time(i)
        RssItem['create_time'] = int(time.time())
        RssItemList[RssItem['unique_code']] = RssItem
        # print RssItem
    self.pipeline.process_item(RssItemList)
def run(self, config):
    self.initConfig(config)
    d = feedparser.parse(config.get('start_urls', '')[0])
    # MD5 check: fingerprint the fetched entries, skip parsing when unchanged
    last_md5 = toMd5(d.entries)
    logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))
    if OPEN_MD5_CHECK and self.last_md5 == last_md5:
        return True
    self.parse(d)  # parse the RSS feed
    syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
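Note that `toMd5(d.entries)` fingerprints the string form of the whole entry list, so any metadata change (timestamps, counters) re-triggers a crawl. A narrower fingerprint, as a sketch rather than the project's code, would hash only the entry links:

# Sketch: fingerprint a feed by its entry links alone (assumption: only
# new or reordered links should trigger a re-crawl).
def feedFingerprint(entries):
    return toMd5('|'.join(e.get('link', '') for e in entries))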
def distinctRequestUrls(self, urls):
    if len(urls) < 1:
        return []
    uniqueCodeDict = {}
    for url in urls:
        uniqueCodeDict[toMd5(url)] = url
    # logging.info("*********uniqueCodeDict : %s *****" % uniqueCodeDict)
    repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
    # logging.info("*********repeatUniqueCode : %s *****" % repeatUniqueCode)
    for unique in repeatUniqueCode:
        del uniqueCodeDict[unique]
    return uniqueCodeDict.values()
def distinctRequestUrls(self, urls):
    if len(urls) < 1:
        return []
    # Skip Redis-based dedup when it is disabled or re-crawling is allowed.
    if (not OPEN_REDIS_DISTINCT) or self.is_duplicate:
        return list(urls)
    uniqueCodeDict = {}
    for url in urls:
        uniqueCodeDict[toMd5(url)] = url
    repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
    for unique in repeatUniqueCode:
        del uniqueCodeDict[unique]
    return uniqueCodeDict.values()
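`requstDistinct` comes from `mySpiders.utils.http` and is never defined in this section. A minimal sketch, assuming a Redis set holds the already-seen codes and the helper returns the subset that were already present (the key name is illustrative):

# Assumed sketch of requstDistinct; 'crawled:unique_codes' is a placeholder.
import redis

_redis = redis.StrictRedis(host='localhost', port=6379, db=0)

def requstDistinct(uniqueCodes):
    repeated = []
    for code in uniqueCodes:
        if _redis.sadd('crawled:unique_codes', code) == 0:  # 0 = already a member
            repeated.append(code)
    return repeated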
def syncCrawlInfos(dataList):
    try:
        http = HttpRequest()
        http.setTimeout(900)
        url = sync_crawl_infos_url
        sqlList = json.dumps(dataList)
        # Ship the serialized rows along with an md5 checksum of the payload.
        body = {"sql": sqlList, "checksum": toMd5(sqlList)}
        encryptFields = []
        headerDict = {"Content-Encoding": "gzip", "Accept-Encoding": "gzip"}
        response = http.setUrl(url).setBody(body).setHeader(headerDict).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        logging.info("-----------%s-------" % e, True)
        return []
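The `checksum` field lets the receiving endpoint detect a corrupted or tampered payload; the server-side check is not shown in this section, but it presumably amounts to something like:

# Sketch of the receiver's integrity check (assumed, not the project's
# actual endpoint code).
def verifyChecksum(body):
    return toMd5(body['sql']) == body['checksum']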
def filterAndPackageDgrate(self):
    uniqueCodeList = []
    insertData = {}
    item = self.item
    rule_id = item['rule_id']
    public_time = int(time.time())
    create_time = int(time.time())
    for index, title in enumerate(item['title']):
        uniqueCode = toMd5(item['source_url'][index])
        if index < len(item['img_url']) and item['img_url'][index]:
            img_url = json.dumps(item['img_url'][index])
        else:
            img_url = ''
        if index < len(item['description']) and item['description'][index]:
            description = item['description'][index]
        else:
            continue  # skip entries without a description
        title = title.decode('utf8')[0:255].encode('utf8')
        uniqueCodeList.append(uniqueCode)
        insertData[uniqueCode] = {
            'source_url': item['source_url'][index],
            'unique_code': uniqueCode,
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
    # Drop rows whose unique_code is already recorded in Redis.
    if uniqueCodeList and OPEN_REDIS_DISTINCT:
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        for unique in repeatUniqueCode:
            del insertData[unique]
    return insertData
def process_item(self, item, spider):
    if not item:
        logging.info('-----------------------list page repeat : %s' % item)
        return True
    public_time = int(time.time())
    create_time = int(time.time())
    for i in xrange(0, len(item['url'])):
        insertData = {
            'title': item['title'][i],
            'url': item['url'][i],
            'unique_code': toMd5(item['url'][i]),
            'share_num': item['share_num'][i],
            'rss_num': item['rss_num'][i],
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
    return True
#!/usr/bin/env python
# coding:utf8
import time

from scrapy.crawler import CrawlerProcess
from mySpiders.utils.http import (getCrawlRssRequestLength, getCrawlRssRequest,
                                  getCrawlNoRssRequestLength, getCrawlNoRssRequest,
                                  requstDistinct, syncLastMd5, syncCrawlInfos)
from mySpiders.utils.CollectionHelper import CollectionHelper
from mySpiders.utils.hash import toMd5
from mySpiders.sql.syncCrawlInfos import SyncCrawlInfos

param = {'last_md5': toMd5(str(time.time())), 'id': '1'}
# print syncLastMd5(param)
# print getCrawlRssRequestLength()
# print getCrawlRssRequest()
# print getCrawlNoRssRequestLength()
# print getCrawlNoRssRequest()

param = [{'4bbce00021506aecf54ba5884a415b16': 'http://blog.csdn.net/hj7jay/article/details/51148995'},
         {'b158d4b6473444ebfbaa2969c99c9e13': 'http://blog.csdn.net/hj7jay/article/details/51149155'},
         {'8d5bba3e45d473946872fc9c5afb94db': 'http://blog.csdn.net/hj7jay/article/details/51149167'},
         {'7b8efef578dcb2c1fc5ecf5490bc60d5': 'http://blog.csdn.net/hj7jay/article/details/51149227'},
         {'2b56b1e675b88d62db17dffa7171068b': 'http://blog.csdn.net/hj7jay/article/details/51149268'},
         {'3c3aed33445faba46168e0b83b5992e8': 'http://blog.csdn.net/hj7jay/article/details/51149207'},
         {'a068fd93771ed6f885294ee6b0069a50': 'http://blog.csdn.net/hj7jay/article/details/51149192'},
         {'56fb1745506c47314b95e5f8a6839bbc': 'http://blog.csdn.net/hj7jay/article/details/51149049'},
         {'18ddd7f50ef05e770663b7e531536284': 'http://blog.csdn.net/hj7jay/article/details/51149026'},