Example #1
    def getNextListPageUrl(self, response):

        requestUrl = []
        self.max_deepth -= 1
        if self.max_deepth < 1:
            logging.info("*********max_deepth : %s   *****" % self.max_deepth)
            return requestUrl

        # logging.info("*********next_request_url : %s   *****" % self.next_request_url)
        nextListPageURL = self.safeParse(response, self.next_request_url)

        # logging.info("*********next_page_url_prefix : %s   *****" % self.next_page_url_prefix)
        if self.next_page_url_prefix:
            nextListPageURL = self.appendDomain(nextListPageURL,
                                                self.next_page_url_prefix,
                                                False)
        else:
            nextListPageURL = self.appendDomain(nextListPageURL, response.url)

        logging.info("*********nextListPageURL : %s   *****" % nextListPageURL)

        if nextListPageURL:
            requestUrl.append(
                Request(nextListPageURL,
                        headers={'Referer': REFERER},
                        callback=self.parse,
                        dont_filter=True))
        return requestUrl
Example #2
    def index(self):

        beginTime = int(time.time())
        records = self.getRecords()
        if not records:
            logging.info('no data need sync!!')
            return False

        syncOverData = syncCrawlInfos(records)

        for record in records:
            uniqueCode = record['unique_code']
            if uniqueCode in syncOverData:
                print "sync success %s " % uniqueCode
                updateSql = "update " + self.tableName + " set `is_sync` = 1,`sync_times` = `sync_times`+1 where `unique_code` = '" + uniqueCode + "' "
            else:
                print "sync fail %s " % uniqueCode
                updateSql = "update " + self.tableName + " set `sync_times` = `sync_times`+1 where `unique_code` = '" + uniqueCode + "' "
            self.db.executeSql(updateSql)

        logging.info('--------------sync records cost time : %s ' %
                     (int(time.time()) - beginTime))
        logging.info('--------------sync records success num : %s' %
                     len(syncOverData))
        logging.info('--------------sync records success : %s' % syncOverData)
        logging.info('--------------sync records fail num : %s' %
                     (len(records) - len(syncOverData)))
        return True
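
The update statements in this example are built by string concatenation, so a quote or unexpected character in `unique_code` would corrupt the query. A minimal sketch of a parameterized alternative, assuming a DB-API style cursor with `%s` placeholders; `mark_synced` and the cursor argument are illustrative, not the project's `executeSql` API (table names cannot be bound as parameters, so the identifier is still interpolated):

def mark_synced(cursor, table_name, unique_code, synced):
    # Hypothetical parameterized rewrite of the UPDATE built above.
    if synced:
        sql = ("UPDATE " + table_name + " SET `is_sync` = 1,"
               " `sync_times` = `sync_times` + 1 WHERE `unique_code` = %s")
    else:
        sql = ("UPDATE " + table_name +
               " SET `sync_times` = `sync_times` + 1 WHERE `unique_code` = %s")
    # The driver escapes unique_code, so quoting issues cannot break the statement.
    cursor.execute(sql, (unique_code,))
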
Example #3
def main():
    try:
        runSpider = RunSpider()
        runSpider.run()
        logging.info("----------runSpider end-----------")
    except Exception, e:
        logging.info("----------runSpider main function Exception : %s-----" % e)
Example #4
    def index(self):

        beginTime = int(time.time())
        records = self.getRecords()
        if not records:
            logging.info('no data need sync!!')
            return False

        syncOverData = syncCrawlInfos(records)

        for record in records:
            uniqueCode = record['unique_code']
            if uniqueCode in syncOverData:
                print "sync success %s " % uniqueCode
                updateSql = "update "+self.tableName+" set `is_sync` = 1,`sync_times` = `sync_times`+1 where `unique_code` = '"+uniqueCode+"' "
            else:
                print "sync fail %s " % uniqueCode
                updateSql = "update "+self.tableName+" set `sync_times` = `sync_times`+1 where `unique_code` = '"+uniqueCode+"' "
            self.db.executeSql(updateSql)


        logging.info('--------------sync records cost time : %s ' % (int(time.time()) - beginTime))
        logging.info('--------------sync records success num : %s' % len(syncOverData))
        logging.info('--------------sync records success : %s' % syncOverData )
        logging.info('--------------sync records fail num : %s' % (len(records) - len(syncOverData)))
        return True
Example #5
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        img_url = json.dumps(item['img_url'])
        description = item['description']
        if not description:
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Example #6
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        if (not item['description']) and (not item['content']):
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        insertOk = self.db.insert(self.tableName, insertData)
        if (not insertOk) and spider.is_duplicate:
            self.db.update(self.tableName, insertData,
                           "unique_code = '" + insertData['unique_code'] + "'")
            logging.info('========update.unique_code : %s' %
                         insertData['unique_code'])

        return True
Example #7
    def parse(self, response):
        """ 列表页解析 """

        last_md5 = ''
        if self.isFirstListPage:
            checkText = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(checkText)

        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" %
                     (last_md5, self.last_md5))
        if (not self.is_duplicate and OPEN_MD5_CHECK
                and self.isFirstListPage and last_md5 == self.last_md5):
            yield []
        else:
            for request in self.getDetailPageUrls(response):
                yield request

            # Get the URL of the next list page
            if not self.isDone:
                for request in self.getNextListPageUrl(response):
                    yield request

            # Sync the MD5 checksum & last_id
            if self.isFirstListPage:
                syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Example #8
    def parse(self, response):
        """ 列表页解析 """

        last_md5 = ''
        if self.isFirstListPage:
            checkText = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(checkText)

        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" % (last_md5, self.last_md5))
        if self.isFirstListPage and last_md5 == self.last_md5:
            yield []
        else:
            for request in self.getDetailPageUrls(response):
                yield request

            # Get the URL of the next list page
            if not self.isDone:
                for request in self.getNextListPageUrl(response):
                    yield request

            # Sync the MD5 checksum & last_id
            if self.isFirstListPage:
                syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Example #9
    def getDetailPageUrls(self, response):

        detailUrls = [
            self.appendDomain(t.encode('utf-8'), response.url)
            for t in self.safeParse(response, self.rule, True, False)
        ]

        # Batch-check whether the URLs are duplicates
        logging.info("*********detailUrls : %s   *****" % detailUrls)
        detailUrlsByFilter = self.distinctRequestUrls(detailUrls)
        logging.info("*********detailUrlsByFilter : %s   *****" %
                     detailUrlsByFilter)

        if len(detailUrls) < 1 or len(detailUrlsByFilter) != len(detailUrls):
            self.isDone = True

        requestUrl = []
        if detailUrlsByFilter:
            for detailUrl in detailUrlsByFilter:
                requestUrl.append(
                    Request(detailUrl,
                            headers={'Referer': REFERER},
                            callback=self.parse_detail_page,
                            dont_filter=True))
        return requestUrl
Example #10
    def run(self):

        while True:

            self.syncDagrame()
            logging.info("---------------sleep %s senconds " % MAIN_LOOP_SLEEP_TIME)
            time.sleep(MAIN_LOOP_SLEEP_TIME)
Example #11
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        img_url = json.dumps(item['img_url'])
        description = item['description']
        if not description:
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Example #12
    def process_item(self, item, spider):

        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        if (not item['description']) and (not item['content']):
            return True

        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        insertOk = self.db.insert(self.tableName, insertData)
        if (not insertOk) and spider.is_duplicate:
            self.db.update(self.tableName, insertData, "unique_code = '" + insertData['unique_code'] + "'")
            logging.info('========update.unique_code : %s' % insertData['unique_code'])

        return True
Example #13
    def appendDomain(self, url, domain=''):

        parsed_uri = urlparse.urlparse(domain)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        logging.info("*********apend before : %s   *****" % url)
        if isinstance(url, (buffer, str)) and not self.url_domain_pattern.match(url):
            url = urlparse.urljoin(domain, url)
        return url
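
appendDomain reduces the supplied domain to scheme://netloc/ and hands relative URLs to urlparse.urljoin. A short illustration of the standard-library behavior it relies on (the example.com URLs are placeholders):

import urlparse  # Python 2 standard library; urllib.parse in Python 3

base = 'http://example.com/list/'
print urlparse.urljoin(base, '/news/1.html')        # -> http://example.com/news/1.html
print urlparse.urljoin(base, 'news/2.html')         # -> http://example.com/list/news/2.html
print urlparse.urljoin(base, 'http://other.com/x')  # absolute URLs pass through unchanged
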
Example #14
    def syncDagrame(self):
        """同步数据到线上"""

        if int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
            logging.info("**********sync crawl infos ************")
            sync = SyncCrawlInfos()
            sync.index()
            self.beginTime = int(time.time())
Example #15
def main():
    try:
        runSpider = RunSpider()
        runSpider.run()
        logging.info("----------runSpider end-----------")
    except Exception, e:
        logging.info("----------runSpider main function Exception : %s-----" %
                     e)
Example #16
    def run(self):

        while True:

            self.syncDagrame()
            logging.info("---------------sleep %s senconds " %
                         MAIN_LOOP_SLEEP_TIME)
            time.sleep(MAIN_LOOP_SLEEP_TIME)
Example #17
    def syncDagrame(self):
        """同步数据到线上"""

        if int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
            logging.info("**********sync crawl infos ************")
            sync = SyncCrawlInfos()
            sync.index()
            self.beginTime = int(time.time())
Example #18
def mainLoop():
    """ 主循环,捕获异常,并重启rss """

    while True:
        try:
            sync = syncDagrame()
            sync.run()
        except Exception, e:
            logging.info("---------------main loop exception : %s " % e)
Example #19
def mainLoop():
    """ 主循环,捕获异常,并重启rss """

    while True:
        try:
            rss = RssPool()
            rss.run()
        except Exception, e:
            logging.info("---------------main loop exception : %s " % e)
Example #20
    def start_requests(self):

        spiderConfig = getCrawlNoRssRequest()
        if not spiderConfig:
            return []

        self.initConfig(spiderConfig)
        logging.info("*********meta******%s****************" % spiderConfig)
        return [Request(spiderConfig.get('start_urls', '')[0], callback=self.parse, dont_filter=True)]
Example #21
    def start_requests(self):

        spiderConfig = getCrawlNoRssRequest()
        if not spiderConfig:
            return []

        self.initConfig(spiderConfig)
        logging.info("*********meta******%s****************" % spiderConfig)
        return [Request(spiderConfig.get('start_urls', '')[0], callback=self.parse, dont_filter=True)]
Example #22
def mainLoop():
    """ 主循环,捕获异常,并重启rss """

    while True:
        try:
            sync = syncDagrame()
            sync.run()
        except Exception, e:
            logging.info("---------------main loop exception : %s " % e)
Example #23
def getCrawlNoRssRequestLength():
    try:
        http = HttpRequest()
        url = requst_norss_length_url
        response = http.setUrl(url).setBody({}).encrypt([]).post()
        res = json.loads(response)['data']
        if res == 'null':
            res = None
        return res
    except Exception, e:
        logging.info("-----%s-----" % e)
        return None
Example #24
def getCrawlNoRssRequestLength():
    try:
        http = HttpRequest()
        url = requst_norss_length_url
        response = http.setUrl(url).setBody({}).encrypt([]).post()
        res = json.loads(response)["data"]
        if res == "null":
            res = None
    except Exception, e:
        logging.info("-----%s-----" % e)
        return None
Example #25
    def filterAndPackageDgrate(self):

        if not OPEN_REDIS_DISTINCT:
            return self.item

        uniqueCodeList = self.item.keys()
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        logging.info('------------distinct before : %s ' % uniqueCodeList)
        for unique in repeatUniqueCode:
            del self.item[unique]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Example #26
    def process_item(self, item):

        if not item:
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True
Example #27
    def process_item(self, item, spider):

        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True
Example #28
    def process_item(self, item):

        if not item:
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True
Example #29
    def filterAndPackageDgrate(self):

        if not OPEN_REDIS_DISTINCT:
            return self.item

        uniqueCodeList = self.item.keys()
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        logging.info('------------distinct before : %s ' % uniqueCodeList)
        for unique in repeatUniqueCode:
            del self.item[unique]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Example #30
    def addRssSpider(self):

        configList = getCrawlRssRequest()
        if not configList:
            self.start = True
            return True

        try:
            spider = CommonFeedRss()
            self.pool.spawn(spider.run, configList)
        except Exception, e:
            logging.info("------------------add spider exception : %s " % e)
Example #31
    def process_item(self, item, spider):

        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True
Example #32
    def run(self, config):

        self.initConfig(config)
        d = feedparser.parse(config.get('start_urls', '')[0])

        # MD5 check
        last_md5 = toMd5(d.entries)
        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" % (last_md5, self.last_md5))
        if OPEN_MD5_CHECK and self.last_md5 == last_md5:
            return True

        self.parse(d)  # Parse the RSS feed
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
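
run() above hands the first start_urls entry to feedparser and hashes d.entries so an unchanged feed can be skipped on the next pass. A minimal standalone sketch of that flow, assuming the feedparser package is installed and using hashlib in place of the project's toMd5 helper (whose exact serialization is not shown):

import hashlib
import feedparser  # third-party package

d = feedparser.parse('http://example.com/rss.xml')  # placeholder feed URL
# Fingerprint the parsed entries; if it matches the stored value, skip this run.
last_md5 = hashlib.md5(str(d.entries)).hexdigest()
for entry in d.entries:
    print entry.get('title'), entry.get('link')
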
Example #33
    def getNextListPageUrl(self, response):

        logging.info("*********next_request_url : %s   *****" % self.next_request_url)
        nextListPageURL = self.appendDomain(
            self.safeParse(response, self.next_request_url),
            response.url)  # .encode('utf-8'))
        logging.info("*********nextListPageURL : %s   *****" % nextListPageURL)

        requestUrl = []
        if nextListPageURL:
            requestUrl.append(
                Request(nextListPageURL, headers={'Referer': REFERER}, callback=self.parse, dont_filter=True))
        return requestUrl
Example #34
def syncLastMd5(params):

    try:
        http = HttpRequest()
        url = sync_last_md5_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)['data']
        if res == 'null':
            res = None
        return res
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
Example #35
def getCrawlRssRequest(params={}):

    try:
        http = HttpRequest()
        url = request_rss_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)["data"]
        if res == "null":
            res = None
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
Example #36
def getCrawlRssRequest(params={}):

    try:
        http = HttpRequest()
        url = request_rss_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)['data']
        if res == 'null':
            res = None
        return res
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
Example #37
def syncLastMd5(params):

    try:
        http = HttpRequest()
        url = sync_last_md5_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)["data"]
        if res == "null":
            res = None
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
Example #38
    def parse_detail_page(self, response):

        logging.info('--------------------parse detail page-----------')
        item = XmlFeedItem()
        item['title'] = self.safeParse(response, self.titleXpath)

        imageAndDescriptionInfos = self.parseDescriptionAndImages(response)
        item['img_url'] = imageAndDescriptionInfos['img_url']
        item['description'] = imageAndDescriptionInfos['description']

        item['public_time'] = self.safeParse(response, self.pubDateXpath)
        item['source_url'] = self.appendDomain(self.safeParse(response, self.guidXpath), response.url)
        item['rule_id'] = self.rule_id
        yield item
Example #39
    def run(self, config):

        self.initConfig(config)
        d = feedparser.parse(config.get('start_urls', '')[0])

        # MD5 check
        last_md5 = toMd5(d.entries)
        logging.info("*********last_md5 : %s   self.last_md5 : %s*****" %
                     (last_md5, self.last_md5))
        if OPEN_MD5_CHECK and self.last_md5 == last_md5:
            return True

        self.parse(d)  # Parse the RSS feed
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
Example #40
    def parse_detail_page(self, response):

        logging.info('--------------------parse detail page-----------')
        item = XmlFeedItem()
        item['title'] = self.safeParse(response, self.titleXpath)

        imageAndDescriptionInfos = self.parseDescriptionAndImages(response)
        item['img_url'] = imageAndDescriptionInfos['img_url']
        item['description'] = imageAndDescriptionInfos['description']

        item['public_time'] = self.safeParse(response, self.pubDateXpath)
        item['source_url'] = self.appendDomain(self.safeParse(response, self.guidXpath), response.url)
        item['rule_id'] = self.rule_id
        yield item
Example #41
def requstDistinct(hashCode):
    try:
        http = HttpRequest()
        url = requst_distinct_url
        hashCode = ",".join(hashCode)
        body = {"field": hashCode}
        encryptFields = []
        response = http.setUrl(url).setBody(body).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info("-----------%s-------" % e)
        return res
Example #42
def requstDistinct(hashCode):
    try:
        http = HttpRequest()
        url = requst_distinct_url
        hashCode = ",".join(hashCode)
        body = {'field': hashCode}
        encryptFields = []
        response = http.setUrl(url).setBody(body).encrypt(encryptFields).post()
        res = json.loads(response)['data']
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info('-----------%s-------' % e)
        return res
Example #43
    def run(self):

        while True:

            if (not self.start) and (not self.pool.full()):
                self.addRssSpider()
                # self.syncDagrame()
                continue

            self.start = False
            if self.pool.free_count() < RSS_MAX_POOL_NUM:
                logging.info("---------------join run ")
                self.pool.join()
            else:
                logging.info("---------------not data ,sleep %s senconds " % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)
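
The spawn/full/free_count/join calls in this loop (and in addRssSpider, Example #30) match the gevent.pool.Pool API. A minimal sketch of that pooling pattern, assuming gevent is the implementation; RSS_MAX_POOL_NUM and the crawl function are stand-ins:

import gevent
from gevent.pool import Pool

RSS_MAX_POOL_NUM = 5           # assumed config value
pool = Pool(RSS_MAX_POOL_NUM)  # at most 5 concurrent greenlets

def crawl(config):
    # Stand-in for CommonFeedRss().run(config).
    gevent.sleep(0.1)

for rule_id in range(20):
    pool.spawn(crawl, {'rule_id': rule_id})  # blocks while the pool is full
pool.join()                                  # wait for every greenlet to finish
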
Example #44
    def parse_detail_page(self, response):

        logging.info('--------------------parse detail page-----------')
        item = CrawlItem()
        item['title'] = self.safeParse(response, self.titleXpath)

        imageAndContentInfos = self.parseContentAndImages(response)
        item['img_url'] = imageAndContentInfos['img_url']
        item['content'] = imageAndContentInfos['content']
        item['description'] = self.parseDescription(imageAndContentInfos['content'])

        item['source_score'] = self.parse_score(response)

        item['public_time'] = self.safeParse(response, self.pubDateXpath)
        item['source_url'] = response.url
        item['rule_id'] = self.rule_id
        yield item
Example #45
    def parse_detail_page(self, response):

        logging.info('--------------------parse detail page-----------')
        item = CrawlItem()
        item['title'] = self.safeParse(response, self.titleXpath)

        imageAndContentInfos = self.parseContentAndImages(response)
        item['img_url'] = imageAndContentInfos['img_url']
        item['content'] = imageAndContentInfos['content']
        item['description'] = self.parseDescription(
            imageAndContentInfos['content'])

        item['source_score'] = self.parse_score(response)

        item['public_time'] = self.safeParse(response, self.pubDateXpath)
        item['source_url'] = response.url
        item['rule_id'] = self.rule_id
        yield item
Example #46
    def getDetailPageUrls(self, response):

        detailUrls = [self.appendDomain(t.encode('utf-8'), response.url)
                      for t in self.safeParse(response, self.rule, True, False)]

        # Batch-check whether the URLs are duplicates
        logging.info("*********detailUrls : %s   *****" % detailUrls)
        detailUrlsByFilter = self.distinctRequestUrls(detailUrls)
        logging.info("*********detailUrlsByFilter : %s   *****" % detailUrlsByFilter)

        if len(detailUrls) < 1 or len(detailUrlsByFilter) != len(detailUrls):
            self.isDone = True

        requestUrl = []
        if detailUrlsByFilter:
            for detailUrl in detailUrlsByFilter:
                requestUrl.append(
                    Request(detailUrl, headers={'Referer': REFERER}, callback=self.parse_detail_page, dont_filter=True))
        return requestUrl
Example #47
def syncCrawlInfos(dataList):

    try:
        http = HttpRequest()
        http.setTimeout(900)
        url = sync_crawl_infos_url
        sqlList = json.dumps(dataList)
        body = {"sql": sqlList, "checksum": toMd5(sqlList)}
        encryptFields = []
        headerDict = {"Content-Encoding": "gzip", "Accept-Encoding": "gzip"}
        response = http.setUrl(url).setBody(body).setHeader(headerDict).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info("-----------%s-------" % e, True)
        return res
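
syncCrawlInfos serializes the record list to JSON and sends an MD5 checksum alongside it so the receiving service can verify the payload. A small sketch of that checksum step, assuming toMd5 is an MD5-hexdigest helper over the serialized string (its real implementation is not shown here):

import json
import hashlib

def to_md5(text):
    # Assumed equivalent of the project's toMd5 helper.
    return hashlib.md5(text).hexdigest()

records = [{'unique_code': 'abc123', 'title': 'example'}]  # illustrative data
sql_list = json.dumps(records)
body = {'sql': sql_list, 'checksum': to_md5(sql_list)}     # server recomputes and compares
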
Example #48
def syncCrawlInfos(dataList):

    try:
        http = HttpRequest()
        http.setTimeout(900)
        url = sync_crawl_infos_url
        sqlList = json.dumps(dataList)
        body = {'sql': sqlList, 'checksum': toMd5(sqlList)}
        encryptFields = []
        headerDict = {'Content-Encoding': 'gzip', 'Accept-Encoding': 'gzip'}
        response = http.setUrl(url).setBody(body).setHeader(
            headerDict).encrypt(encryptFields).post()
        res = json.loads(response)['data']
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info('-----------%s-------' % e)
        return res
Example #49
    def process_item(self, item, spider):

        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        public_time = int(time.time())
        create_time = int(time.time())

        for i in xrange(0, len(item['url'])):
            insertData = {
                'title': item['title'][i],
                'url': item['url'][i],
                'unique_code': toMd5(item['url'][i]),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
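
process_item here walks four parallel lists by index. A self-contained sketch of the same pairing done with zip; build_rows is a hypothetical helper and hashlib stands in for the project's toMd5:

import time
import hashlib

def build_rows(item):
    # Hypothetical helper: pair up the parallel title/url/share_num/rss_num lists.
    now = int(time.time())
    return [{
        'title': title,
        'url': url,
        'unique_code': hashlib.md5(url).hexdigest(),  # stand-in for toMd5
        'share_num': share_num,
        'rss_num': rss_num,
        'public_time': now,
        'create_time': now,
    } for title, url, share_num, rss_num in zip(item['title'], item['url'],
                                                item['share_num'], item['rss_num'])]
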
Example #50
    def process_item(self, item, spider):

        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        public_time = int(time.time())
        create_time = int(time.time())

        for i in xrange(0, len(item['url'])):
            insertData = {
                'title': item['title'][i],
                'url': item['url'][i],
                'unique_code': toMd5(item['url'][i]),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
Example #51
    def getNextListPageUrl(self, response):

        requestUrl = []
        self.max_deepth -= 1
        if self.max_deepth < 1:
            logging.info("*********max_deepth : %s   *****" % self.max_deepth)
            return requestUrl

        # logging.info("*********next_request_url : %s   *****" % self.next_request_url)
        nextListPageURL = self.safeParse(response, self.next_request_url)

        # logging.info("*********next_page_url_prefix : %s   *****" % self.next_page_url_prefix)
        if self.next_page_url_prefix:
            nextListPageURL = self.appendDomain(nextListPageURL, self.next_page_url_prefix, False)
        else:
            nextListPageURL = self.appendDomain(nextListPageURL, response.url)

        logging.info("*********nextListPageURL : %s   *****" % nextListPageURL)

        if nextListPageURL:
            requestUrl.append(
                Request(nextListPageURL, headers={'Referer': REFERER}, callback=self.parse, dont_filter=True))
        return requestUrl
Example #52
    def run(self):
        while True:
            num = getCrawlNoRssRequestLength()
            logging.info("********need deal request num : %s " % num)
            if not num:
                if self.runNum >= 1:
                    logging.info("*****************size:%s********runNum:%s********" % (self.size, self.runNum))
                    self.runSpider()
                break
            else:
                self.initSpider()
                if self.runNum >= self.size:
                    logging.info("*****************size:%s********runNum:%s********" % (self.size, self.runNum))
                    self.runSpider()
                    break
Example #53
    def run(self):
        while True:
            num = getCrawlNoRssRequestLength()
            logging.info("********need deal request num : %s " % num)
            if not num:
                if self.runNum >= 1:
                    logging.info(
                        "*****************size:%s********runNum:%s********" %
                        (self.size, self.runNum))
                    self.runSpider()
                break
            else:
                self.initSpider()
                if self.runNum >= self.size:
                    logging.info(
                        "*****************size:%s********runNum:%s********" %
                        (self.size, self.runNum))
                    self.runSpider()
                    break
Example #54
def startScript():
    times = 0
    # beginTime = int(time.time())
    while True:
        try:
            times += 1
            num = getCrawlNoRssRequestLength()
            logging.info("**********need deal request num :%s************" % num)

            if not num:
                logging.info("**********sleep:%s************" % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)
            else:
                os.system('python runSpider.py')

            # if times > RUN_SYNC_INTERVAL_TIMES or int(time.time()) - beginTime > RUN_SYNC_INTERVAL_TIME:
            #     logging.info("**********sync crawl infos ************")
            #     sync = SyncCrawlInfos()
            #     sync.index()
            #     times = 0
            #     beginTime = int(time.time())

        except Exception, e:
            logging.info("--------------%s------------" % e)