Example #1
    def parse(self, response):
        """ Parse the list page """

        last_md5 = ''
        if self.isFirstListPage:
            checkText = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(checkText)

        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" %
                     (last_md5, self.last_md5))
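        # skip this crawl when md5 checking is enabled, re-crawling duplicates is off,
        # and the first list page is unchanged since the last run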
        if ((not self.is_duplicate) and OPEN_MD5_CHECK
                and self.isFirstListPage and last_md5 == self.last_md5):
            yield []
        else:
            for request in self.getDetailPageUrls(response):
                yield request

            # get the URL of the next list page
            if not self.isDone:
                for request in self.getNextListPageUrl(response):
                    yield request

            # sync the md5 & last_id
            if self.isFirstListPage:
                syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Example #2
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        img_url = json.dumps(item['img_url'])
        description = item['description']
        if not description:
            return True

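        # truncate the title to 255 characters without splitting multi-byte utf-8 sequences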
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Example #3
    def parse_node(self, response):

        self.currentNode = response
        # logging.info("*********meta******%s****************" % response.meta['spiderConfig'])
        self.initConfig(response.meta['spiderConfig'])

        checkText = self.safeParse(self.checkTxtXpath)
        last_md5 = toMd5(checkText)
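        # the node content is unchanged since the last crawl, so there is nothing new to yield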
        if last_md5 == response.meta['spiderConfig'].get('last_md5', ''):
            yield []
        else:
            item = XmlFeedItem()
            item['title'] = [t.encode('utf-8') for t in self.safeParse(self.titleXpath)]

            imageAndDescriptionInfos = self.parseDescriptionAndImages()
            item['img_url'] = imageAndDescriptionInfos['img_url']
            item['description'] = imageAndDescriptionInfos['description']

            item['public_time'] = [p.encode('utf-8') for p in self.safeParse(self.pubDateXpath)]
            item['source_url'] = [g.encode('utf-8') for g in self.safeParse(self.guidXpath)]
            item['rule_id'] = self.rule_id
            yield item

            # sync the new md5 back and fetch the next crawl config
            spiderConfig = getCrawlNoRssRequest({'last_md5': last_md5, 'id': self.rule_id})
            if spiderConfig:
                yield Request(spiderConfig.get('start_urls', '')[0],
                              headers={'Referer': 'http://www.google.com'},
                              meta={'spiderConfig': spiderConfig},
                              callback=self.parse_node,
                              dont_filter=True)
Example #4
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        if (not item['description']) and (not item['content']):
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        insertOk = self.db.insert(self.tableName, insertData)
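        # the insert failed (most likely a duplicate unique_code); if the spider
        # allows re-crawling duplicates, refresh the existing row instead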
        if (not insertOk) and spider.is_duplicate:
            self.db.update(self.tableName, insertData,
                           "unique_code = '" + insertData['unique_code'] + "'")
            logging.info('========update.unique_code : %s' %
                         insertData['unique_code'])

        return True
Example #5
    def parse(self, response):
        """ Parse the list page """

        last_md5 = ''
        if self.isFirstListPage:
            checkText = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(checkText)

        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" % (last_md5, self.last_md5))
        if self.isFirstListPage and last_md5 == self.last_md5:
            yield []
        else:
            for request in self.getDetailPageUrls(response):
                yield request

            # get the URL of the next list page
            if not self.isDone:
                for request in self.getNextListPageUrl(response):
                    yield request

            # sync the md5 & last_id
            if self.isFirstListPage:
                syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Example #6
    def parse(self, data):

        RssItemList = {}
        # CollectionHelper.printEx(data)
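        # key the collected items by unique_code so repeated links within the feed collapse into one entry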
        for i in data.entries:
            RssItem = {}
            RssItem['source_url'] = i.get('link', '')
            if not RssItem['source_url']:
                continue

            RssItem['unique_code'] = toMd5(RssItem['source_url'])
            RssItem['rule_id'] = self.rule_id
            RssItem['title'] = i.get('title', '')

            text = self.parse_content(i)
            tmpInfos = self.parseContentAndImg(text)
            RssItem['content'] = tmpInfos['content']
            RssItem['img_url'] = json.dumps(
                tmpInfos['img_url']) if tmpInfos['img_url'] else ""

            RssItem['description'] = self.parseDescription(i)
            RssItem['public_time'] = self.parse_public_time(i)
            RssItem['create_time'] = int(time.time())
            RssItemList[RssItem['unique_code']] = RssItem
            # print RssItem
        self.pipeline.process_item(RssItemList)
Example #7
    def run(self, config):

        self.initConfig(config)
        d = feedparser.parse(config.get('start_urls', '')[0])

        # md5 check
        last_md5 = toMd5(d.entries)
        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" % (last_md5, self.last_md5))
        if OPEN_MD5_CHECK and self.last_md5 == last_md5:
            return True

        self.parse(d)  # parse the RSS feed
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
Example #8
    def distinctRequestUrls(self, urls):

        if len(urls) < 1:
            return []

        uniqueCodeDict = {}
        for url in urls:
            uniqueCodeDict[toMd5(url)] = url

        # logging.info("*********uniqueCodeDict : %s   *****" % uniqueCodeDict)
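        # requstDistinct returns the hashes that are already known; drop those urls and keep the rest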
        repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
        # logging.info("*********repeatUniqueCode : %s   *****" % repeatUniqueCode)

        for unique in repeatUniqueCode:
            del uniqueCodeDict[unique]
        return uniqueCodeDict.values()
Example #9
    def distinctRequestUrls(self, urls):

        if len(urls) < 1:
            return []

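        # with Redis-based dedup disabled, or duplicates explicitly allowed, keep every url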
        if (not OPEN_REDIS_DISTINCT) or self.is_duplicate:
            return list(urls)

        uniqueCodeDict = {}
        for url in urls:
            uniqueCodeDict[toMd5(url)] = url

        repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
        for unique in repeatUniqueCode:
            del uniqueCodeDict[unique]
        return uniqueCodeDict.values()
Example #10
def syncCrawlInfos(dataList):

    try:
        http = HttpRequest()
        http.setTimeout(900)
        url = sync_crawl_infos_url
        sqlList = json.dumps(dataList)
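        # send the serialized sql list together with an md5 checksum so the receiver can verify it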
        body = {"sql": sqlList, "checksum": toMd5(sqlList)}
        encryptFields = []
        headerDict = {"Content-Encoding": "gzip", "Accept-Encoding": "gzip"}
        response = http.setUrl(url).setBody(body).setHeader(headerDict).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info("-----------%s-------" % e)
        return res
Example #11
    def filterAndPackageDgrate(self):

        uniqueCodeList = []
        insertData = {}
        item = self.item

        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        for index, title in enumerate(item['title']):

            uniqueCode = toMd5(item['source_url'][index])
            if index < len(item['img_url']) and item['img_url'][index]:
                img_url = json.dumps(item['img_url'][index])
            else:
                img_url = ''

            if index < len(item['description']) and item['description'][index]:
                description = item['description'][index]
            else:
                continue

            title = title.decode('utf8')[0:255].encode('utf8')
            uniqueCodeList.append(uniqueCode)
            insertData[uniqueCode] = {
                'source_url': item['source_url'][index],
                'unique_code': uniqueCode,
                'rule_id': rule_id,
                'title': title,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time
            }

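        # drop entries whose unique_code the dedup service already knows about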
        if uniqueCodeList and OPEN_REDIS_DISTINCT:
            repeatUniqueCode = requstDistinct(uniqueCodeList)
            for unique in repeatUniqueCode:
                del insertData[unique]

        return insertData
Example #12
    def process_item(self, item, spider):

        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        public_time = int(time.time())
        create_time = int(time.time())

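        # the item carries parallel lists (title/url/share_num/rss_num); write one row per url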
        for i in xrange(0, len(item['url'])):
            insertData = {
                'title': item['title'][i],
                'url': item['url'][i],
                'unique_code': toMd5(item['url'][i]),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
Example #13
#!/usr/bin/env python
# coding:utf8

import time
from scrapy.crawler import CrawlerProcess
from mySpiders.utils.http import (getCrawlRssRequestLength, getCrawlRssRequest,
    getCrawlNoRssRequestLength, getCrawlNoRssRequest, requstDistinct, syncLastMd5, syncCrawlInfos)

from mySpiders.utils.CollectionHelper import CollectionHelper

from mySpiders.utils.hash import toMd5
from mySpiders.sql.syncCrawlInfos import SyncCrawlInfos


param = {'last_md5': toMd5(str(time.time())),
         'id': '1'}
# print syncLastMd5(param)
# print getCrawlRssRequestLength()
# print getCrawlRssRequest()
# print getCrawlNoRssRequestLength()
# print getCrawlNoRssRequest()

param = [{'4bbce00021506aecf54ba5884a415b16': 'http://blog.csdn.net/hj7jay/article/details/51148995'},
         {'b158d4b6473444ebfbaa2969c99c9e13': 'http://blog.csdn.net/hj7jay/article/details/51149155'},
         {'8d5bba3e45d473946872fc9c5afb94db': 'http://blog.csdn.net/hj7jay/article/details/51149167'},
         {'7b8efef578dcb2c1fc5ecf5490bc60d5': 'http://blog.csdn.net/hj7jay/article/details/51149227'},
         {'2b56b1e675b88d62db17dffa7171068b': 'http://blog.csdn.net/hj7jay/article/details/51149268'},
         {'3c3aed33445faba46168e0b83b5992e8': 'http://blog.csdn.net/hj7jay/article/details/51149207'},
         {'a068fd93771ed6f885294ee6b0069a50': 'http://blog.csdn.net/hj7jay/article/details/51149192'},
         {'56fb1745506c47314b95e5f8a6839bbc': 'http://blog.csdn.net/hj7jay/article/details/51149049'},
         {'18ddd7f50ef05e770663b7e531536284': 'http://blog.csdn.net/hj7jay/article/details/51149026'},