def parse_item(self, response):
    """Parse a topic page and yield one download request per .apk link.

    Flow, as implemented below:

    1. If the page body contains the "topic removed / under review" marker,
       the topic is marked as crawled and nothing is yielded.
    2. If no ``.apk`` anchor is found and the page carries the "reply to see
       hidden content" marker, the topic priority is set to ``'-1'`` and
       nothing is yielded.
    3. If no ``.apk`` anchor is found at all, the topic is marked as crawled,
       ``self.topic_reply_num`` is decremented and the request returned by
       ``self.get_down_topic`` is yielded.
    4. Otherwise each distinct link is requested with
       ``self.get_downloadpath`` as the callback.

    Markers are GBK-encoded before being matched against the raw body.

    :param response: the downloaded topic page; ``response.request.meta``
        is expected to carry a ``'topic_url'`` entry.
    """
    log.msg(response.url, log.INFO)

    try:
        topic_url = response.request.meta['topic_url']
        removed_marker = u'指定的主题不存在或已被删除或正在被审核,请返回'.encode('gbk', 'ignore')
        if re.findall(removed_marker, response.body, re.I):
            # The topic is gone: flag it as crawled so it is not revisited.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            handle_db(sql.topicurl_withcrawed(topic_url))
            return
    except Exception:
        # Best effort: record the failure and still try to extract links.
        # format_exc() returns the traceback text (print_exc() returns None).
        log.msg(traceback.format_exc(), log.INFO)

    try:
        urls = re.findall(r'<a href="(.*?)".*\.apk</a>', response.body, re.I)
        if not urls:
            # The attachment may be hidden until the user replies.
            reply_marker = u'如果你要查看本帖隐藏内容请'.encode('gbk', 'ignore')
            reply_urls = re.findall(reply_marker, response.body, re.I)
            print('reply_urls')
            print(reply_urls)
            if reply_urls:
                # Reply-gated topic: set priority to -1 so the combined
                # reply-and-download module can pick it up separately.
                update_topic_priority_sql = sql.update_topic_priority(
                    self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(str(n), log.INFO)
                return

            log.msg(('this url->%s has not apk file' % topic_url), log.INFO)
            # Without an apk the topic link is useless: mark it as crawled.
            handle_db(sql.topicurl_withcrawed(topic_url))

            # The reply succeeded but produced no download link; the current
            # login session must not stop here, so move on.
            self.topic_reply_num -= 1
            log.msg(('reply success, will download software%s' % str(self.topic_reply_num)),
                    log.DEBUG)
            log.msg(str(response), log.DEBUG)
            # NOTE(review): the original passed an unbound local ``url`` here
            # (always UnboundLocalError on this path). ``response.url`` is
            # assumed to be the intended argument -- confirm against
            # get_down_topic.
            yield self.get_down_topic(response, response.url)
        else:
            for href in set(urls):
                # Undo the HTML escaping of '&' in the href ('&amp;' -> '&').
                href = href.replace('amp;', '')
                print('url: %s' % href)
                request = response.request.replace(url=href, callback=self.get_downloadpath)
                request.meta['url'] = response.url
                yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package could be located: set this link's crawl state to 1.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def parse_item(self, response):
    """Parse a topic page and yield one request per attached .apk link.

    Flow, as implemented below:

    1. If the page body contains the "topic removed / under review" marker,
       the topic is marked as crawled and nothing is yielded.
    2. If no ``.apk`` attachment anchor is found and the page carries the
       "reply to see hidden content" marker, the topic priority is set to
       ``'-1'`` and nothing is yielded.
    3. If no ``.apk`` attachment anchor is found at all, the topic is marked
       as crawled and nothing is yielded.
    4. Otherwise each distinct relative link is prefixed with
       ``http://bbs.mumayi.com/`` and requested with ``self.get_apk`` as the
       callback.

    Markers are GBK-encoded before being matched against the raw body.

    :param response: the downloaded topic page; ``response.request.meta``
        is expected to carry a ``'topic_url'`` entry.
    """
    log.msg(response.url, log.INFO)

    try:
        # Resolve topic_url first: the removed-topic branch needs it (the
        # original read it only after that branch, so the branch always
        # failed with UnboundLocalError and the topic was never updated).
        topic_url = response.request.meta['topic_url']
        removed_marker = u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore')
        if re.findall(removed_marker, response.body, re.I):
            # The topic is gone: flag it as crawled so it is not revisited.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            handle_db(sql.topicurl_withcrawed(topic_url))
            return
    except Exception:
        # Best effort: record the failure and still try to extract links.
        # format_exc() returns the traceback text (print_exc() returns None).
        log.msg(traceback.format_exc(), log.INFO)

    try:
        # '.' and '?' are escaped so only literal "forum.php?..." hrefs match.
        urls = re.findall(
            r'<a href="(forum\.php\?[^"]+)" target="_blank">.*?\.apk</a>',
            response.body, re.I)
        if not urls:
            # The attachment cannot be downloaded before replying.
            noreply_regex = u'如果您要查看本帖隐藏内容请'.encode('gbk', 'ignore')
            noreply = re.findall(noreply_regex, response.body, re.I)
            for hit in noreply:
                print(hit.decode('gbk', 'ignore'))
            if noreply:
                # Reply-gated topic: update its priority to -1.
                update_topic_priority_sql = sql.update_topic_priority(
                    self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(''.join(['hide_apk_update topic_url priority=-1', str(n)]), log.INFO)
                return

            log.msg(('this url->%s has not apk file' % topic_url), log.INFO)
            # Without an apk the topic link is useless: mark it as crawled.
            handle_db(sql.topicurl_withcrawed(topic_url))
            return

        for href in set(urls):
            # The matched hrefs are site-relative; make them absolute.
            absolute_url = 'http://bbs.mumayi.com/%s' % href
            request = response.request.replace(url=absolute_url, callback=self.get_apk)
            request.meta['url'] = response.url
            yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package could be located: set this link's crawl state to 1.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return