def parse_item(self, response):
    """Yield one follow-up request per downloadable attachment link.

    Scans ``response.body`` for anchors whose text names an ``.apk``,
    ``.zip`` or ``.rar`` file. Every distinct href is prefixed with the
    gfan base URL and requested with ``self.get_attachementpath`` as the
    callback; the page URL is stored under ``meta['url']``.

    When no link is found, the topic URL from ``meta['topic_url']`` is
    passed through ``sql.topicurl_withcrawed`` and ``handle_db``.
    """
    topic_url = response.request.meta['topic_url']
    try:
        # The extension must be an alternation group. The former character
        # class ``[apk|zip|rar]`` matched any single one of those letters,
        # so links such as ``.png`` or ``.pdf`` were accepted as well.
        urls = re.findall(
            r'<a.*?href="(.*?)".*?>.*\.(?:apk|zip|rar).*?</a>',
            response.body, re.I)
        print(urls)
        if not urls:
            print('this url->%s has not apk file' % (topic_url,))
            # Without a package the topic link is of no further use; the
            # original comment says this update sets is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            print(status)
        for url in set(urls):
            url = 'http://bbs.gfan.com/%s' % url
            request = response.request.replace(
                url=url, callback=self.get_attachementpath)
            request.meta['url'] = response.url
            yield request
        return
    except IndexError:
        traceback.print_exc()
        # No apk package on this page: update the crawl state of the link.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        # Was ``handler_db``; the rest of this function uses ``handle_db``.
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            print('this url has no apk')
        return
def parse_item(self, response):
    """Parse a topic page and yield requests for its ``.apk`` links.

    Outcomes, in the order they are checked:

    * the page carries the "topic removed" notice (GBK-encoded): the
      topic is passed through ``sql.topicurl_withcrawed`` / ``handle_db``
      and nothing is yielded;
    * no ``.apk`` link but the "reply to see hidden content" notice is
      present: the topic priority is set to ``'-1'`` and nothing is
      yielded;
    * no ``.apk`` link at all: the topic is updated, the reply counter is
      decremented and the result of ``self.get_down_topic`` is yielded;
    * otherwise one request per distinct link is yielded with
      ``self.get_downloadpath`` as the callback.
    """
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']
        removed_notice = u'指定的主题不存在或已被删除或正在被审核,请返回'.encode('gbk', 'ignore')
        if re.findall(removed_notice, response.body, re.I):
            # The topic is gone: run the topic update and stop.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns
        # None, so the old code only ever logged the string "None".
        log.msg(traceback.format_exc(), log.INFO)
    try:
        urls = re.findall(r'<a href="(.*?)".*\.apk</a>', response.body, re.I)
        if not urls:
            # Some topics only reveal the download after a reply.
            reply_notice = u'如果你要查看本帖隐藏内容请'.encode('gbk', 'ignore')
            reply_urls = re.findall(reply_notice, response.body, re.I)
            print('reply_urls')
            print(reply_urls)
            if reply_urls:
                # Reply required: set the priority to -1. Per the original
                # comment a separate reply-and-download module looks for
                # topics with that priority.
                update_topic_priority_sql = sql.update_topic_priority(
                    self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(str(n), log.INFO)
                return
            log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is of no further use; the
            # original comment says this update sets is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            # Replied but found no download link: the session must not
            # stop here, so carry on with the next topic.
            self.topic_reply_num -= 1
            log.msg(('reply success, will download software%s' % str(self.topic_reply_num)), log.DEBUG)
            log.msg(str(response), log.DEBUG)
            # NOTE(review): ``url`` is never bound on this path (the loop
            # below has not run), so this call raises NameError as written.
            # Confirm which URL get_down_topic expects before changing it.
            request = self.get_down_topic(response, url)
            yield request
        else:
            for url in set(urls):
                url = url.replace('amp;', '')
                print('url: %s' % (url,))
                request = response.request.replace(
                    url=url, callback=self.get_downloadpath)
                request.meta['url'] = response.url
                yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package on this page: update the crawl state of the link.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        # Was ``handler_db``; the rest of this function uses ``handle_db``.
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def parse_item(self, response):
    """Parse a mumayi topic page and yield requests for its ``.apk`` links.

    Outcomes, in the order they are checked:

    * the page carries the "topic removed" notice (GBK-encoded): the
      topic is passed through ``sql.topicurl_withcrawed`` / ``handle_db``
      and nothing is yielded;
    * no ``.apk`` link but the "reply to see hidden content" notice is
      present: the topic priority is set to ``'-1'``;
    * no ``.apk`` link at all: the topic is updated and nothing is
      yielded;
    * otherwise one request per distinct link is yielded with
      ``self.get_apk`` as the callback.
    """
    log.msg(response.url, log.INFO)
    try:
        # Read the topic URL first. It used to be assigned after the
        # removed-topic branch that needs it, so that branch raised a
        # NameError which the bare except swallowed and the update never
        # ran.
        topic_url = response.request.meta['topic_url']
        removed_notice = u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore')
        if re.findall(removed_notice, response.body, re.I):
            # The topic is gone: run the topic update and stop.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns
        # None, so the old code only ever logged the string "None".
        log.msg(traceback.format_exc(), log.INFO)
    try:
        urls = re.findall(
            r'<a href="(forum.php?[^"]+)" target="_blank">.*?\.apk</a>',
            response.body, re.I)
        if not urls:
            # The download may be hidden until the user has replied.
            noreply_regex = u'如果您要查看本帖隐藏内容请'.encode('gbk', 'ignore')
            noreply = re.findall(noreply_regex, response.body, re.I)
            for i in noreply:
                print(i.decode('gbk', 'ignore'))
            if noreply:
                # Reply required: update the topic with priority -1.
                update_topic_priority_sql = sql.update_topic_priority(
                    self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(''.join(['hide_apk_update topic_url priority=-1', str(n)]), log.INFO)
                return
            log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is of no further use; the
            # original comment says this update sets is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
        for url in set(urls):
            url = 'http://bbs.mumayi.com/%s' % url
            request = response.request.replace(url=url, callback=self.get_apk)
            request.meta['url'] = response.url
            yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package on this page: update the crawl state of the link.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        # Was ``handler_db``; the rest of this function uses ``handle_db``.
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def get_apk(self, response):
    """Write the downloaded package to disk and record it in the database.

    The file name is built from a random number, the name stored under
    ``meta['filename']`` and the current Unix timestamp, inside the
    directory returned by ``utils.make_spiderdir(self.name, 'download')``.
    After the file is written, a software-info row is inserted, the topic
    is updated through ``sql.topicurl_withcrawed``, and the file is handed
    to ``autocopy.copy`` as a backup. A failed backup is logged, not
    raised.
    """
    filename = response.request.meta['filename']
    filename = ''.join([str(random.randrange(1, 100000)), '.', filename])
    # Create (or look up) the spider's download directory.
    down_dir = utils.make_spiderdir(self.name, 'download')
    # Some names may be UTF-8 byte strings; decode them to unicode.
    # Values that cannot be decoded are used unchanged.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except (AttributeError, UnicodeError):
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    # Close the handle deterministically; the old
    # ``open(...).write(...)`` form left that to the garbage collector.
    with open(filename, 'wb') as apk_file:
        apk_file.write(response.body)
    # Store the database row only after the download is on disk.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    # Double the backslashes before the path is embedded in the SQL text.
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.name, hashurl, topic_url, updatedate, filename)
    status = db.handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = db.handle_db(update_topic_url)
    log.msg(str(status), log.DEBUG)
    # Back up the file.
    # NOTE(review): the backslash-escaped path is what gets passed to
    # autocopy.copy; confirm that is intended.
    try:
        autocopy.copy(filename, self.name)
        log.msg('copy job is successed', log.INFO)
    except Exception:
        # Log the cause too; a bare message hid why the copy failed.
        log.msg(traceback.format_exc(), log.ERROR)
        log.msg('copy job is failture', log.ERROR)
def parse_item(self, response):
    """Parse a mumayi topic page and yield requests for its ``.apk`` links.

    * If the page carries the "topic removed" notice (GBK-encoded), the
      topic is passed through ``sql.topicurl_withcrawed`` / ``handle_db``
      and nothing is yielded.
    * If no ``.apk`` link is found, the topic is updated and the result of
      ``self.repeat_reply(response)`` is yielded.
    * Otherwise one GET request per distinct link is yielded with
      ``self.get_apk`` as the callback.
    """
    log.msg(response.url, log.INFO)
    try:
        # Read the topic URL first. It used to be assigned after the
        # removed-topic branch that needs it, so that branch raised a
        # NameError which the bare except swallowed and the update never
        # ran.
        topic_url = response.request.meta['topic_url']
        removed_notice = u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore')
        if re.findall(removed_notice, response.body, re.I):
            # The topic is gone: run the topic update and stop.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns
        # None, so the old code only ever logged the string "None".
        log.msg(traceback.format_exc(), log.INFO)
    try:
        urls = re.findall(r'<a href="(forum.php?[^"]+)".*>.*?\.apk', response.body, re.I)
        if not urls:
            log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is of no further use; the
            # original comment says this update sets is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            request_topic = self.repeat_reply(response)
            yield request_topic
        else:
            for url in set(urls):
                url = 'http://bbs.mumayi.com/%s' % url
                # Turn the HTML-escaped '&amp;' back into '&'.
                url = url.replace('amp;', '')
                request = response.request.replace(
                    url=url, method='get', callback=self.get_apk)
                request.meta['url'] = response.url
                yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package on this page: update the crawl state of the link.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        # Was ``handler_db``; the rest of this function uses ``handle_db``.
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def parse_item(self, response):
    """Parse a gfan topic page and yield requests for its ``.apk`` links.

    * If the page carries the "topic removed" notice, the topic is passed
      through ``sql.topicurl_withcrawed`` / ``handle_db`` and nothing is
      yielded.
    * If no ``.apk`` attachment link is found, the topic is updated the
      same way and nothing is yielded.
    * Otherwise one request per distinct link is yielded with
      ``self.get_attachementpath`` as the callback.
    """
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']
        if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
            # The topic is gone: run the topic update and stop.
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns
        # None, so the old code only ever logged the string "None".
        log.msg(traceback.format_exc(), log.INFO)
    try:
        urls = re.findall(r'<a href="(attachment.php?[^"]+)".*?>.*\.apk', response.body, re.I)
        if not urls:
            log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is of no further use; the
            # original comment says this update sets is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
        for url in set(urls):
            url = 'http://bbs.gfan.com/%s' % url
            request = response.request.replace(
                url=url, callback=self.get_attachementpath)
            request.meta['url'] = response.url
            yield request
    except IndexError:
        log.msg(traceback.format_exc(), log.ERROR)
        # No apk package on this page: update the crawl state of the link.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        # Was ``handler_db``; the rest of this function uses ``handle_db``.
        status = handle_db(update_crawled_sql)
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def get_downloadpath(self, response):
    """Follow the goapk attachment link found on the page.

    Returns a copy of the current request that points at the first
    attachment URL in ``response.body`` and uses ``self.get_apk`` as the
    callback. If anything goes wrong (for example no link matches), the
    traceback is printed, the topic is passed through
    ``sql.topicurl_withcrawed`` / ``handle_db`` and ``None`` is returned.
    """
    meta = response.request.meta
    topic_url = meta['topic_url']
    log.msg(('get_downloadpath,topic_url', meta['topic_url']), log.INFO)
    try:
        # The page that precedes the real download address is response.url.
        matches = re.findall(u'<a href="(http://bbs\.goapk\.com/forum\.php\?mod=attachment.*aid=.*)">', response.body, re.I)
        # Turn the HTML-escaped '&amp;' back into '&'.
        download_url = matches[0].replace('amp;', '')
        return response.request.replace(url=download_url, callback=self.get_apk)
    except:
        traceback.print_exc()
        # Reaching this point presumably means the attachment link could
        # not be extracted; the topic is updated either way.
        status = handle_db(sql.topicurl_withcrawed(topic_url))
        print(status)
def get_downloadpath(self, response):
    """Follow the download button on a gfan attachment page.

    Returns a copy of the current request that points at the link inside
    the ``alert_btnleft`` paragraph, uses ``self.get_apk`` as the callback
    and carries the quoted file name from the ``alert_info`` block under
    ``meta['filename']``.

    If the link or the file name cannot be extracted, the traceback is
    printed, the topic is passed through ``sql.topicurl_withcrawed`` /
    ``handle_db`` and ``None`` is returned.
    """
    # Bind the topic URL up front: the except branch needs it, and it was
    # never defined in this function before, so that branch raised a
    # NameError.
    topic_url = response.request.meta['topic_url']
    print('get_downloadpath,topic_url %s' % (topic_url,))
    try:
        # The page that precedes the real download address is response.url.
        url = re.findall('<p class="alert_btnleft"><a href="(.*?)">', response.body, re.I)[0]
        url = 'http://bbs.gfan.com/%s' % url
        # File name (provisional, per the original comment).
        file_name = re.findall('<div class="alert_info">\\s+<p>.*?“(.*?)”', response.body, re.I)[0]
        request = response.request.replace(url=url, callback=self.get_apk)
        request.meta['filename'] = file_name
        return request
    except Exception:
        traceback.print_exc()
        # Reaching this point presumably means the file name could not be
        # scraped; the topic is updated either way.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        print(status)
def get_code(self, response):
    """Yield the request for the security-question snippet.

    If the page carries the "topic removed" notice, the topic is passed
    through ``sql.topicurl_withcrawed`` / ``handle_db`` and nothing is
    yielded. Otherwise a copy of the current request is pointed at the
    gfan ``newupdatesecqaa`` ajax URL with ``self.ay_code`` as the
    callback; the topic-page response is attached to the meta of both the
    new request and the current one.
    """
    # The site administrator has removed the topic page.
    if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
        # ``topic_url`` was never defined in this function, so this branch
        # raised a NameError; read it from the request meta.
        topic_url = response.request.meta['topic_url']
        print('this topic_url(%s) has be removed by admin' % response.url)
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        print(status)
        return
    print('response.url %s' % (response.url,))
    request_code = response.request.replace(
        url='http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',
        callback=self.ay_code)
    request_code.meta['response'] = response
    response.request.meta['response'] = response
    yield request_code
def get_apk(self, response):
    """Save the downloaded apk, record it, and charge one account point.

    The body is written to a randomly named ``.apk`` file inside the
    directory returned by ``utils.make_spiderdir(self.platform,
    'download')``. Then a software-info row is inserted, the topic is
    updated through ``sql.topicurl_withcrawed``, the account's stored
    point count is decremented by one, and the file is handed to
    ``autocopy.copy`` as a backup (failures are logged, not raised).

    Returns whatever ``self.repeat_reply(response)`` returns.
    """
    filename = ''.join([str(random.randrange(1, 100000)), '.apk'])
    # Create (or look up) the platform's download directory.
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # Some names may be UTF-8 byte strings; decode them to unicode.
    # Values that cannot be decoded are used unchanged.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except (AttributeError, UnicodeError):
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    # Close the handle deterministically; the old
    # ``open(...).write(...)`` form left that to the garbage collector.
    with open(filename, 'wb') as apk_file:
        apk_file.write(response.body)
    # Store the database row only after the download is on disk.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    # Double the backslashes before the path is embedded in the SQL text.
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = handle_db(update_topic_url)
    log.msg(str(status), log.DEBUG)
    # Getting here means the response was downloaded: deduct one point.
    account_sql = sql.select_accountbyusername(self.username, self.platform)
    point_num = handle_db(account_sql)['data'][0][5]
    point_num -= 1
    # Persist the new point count.
    update_account_pointsql = sql.update_account_point(self.username, self.platform, point_num)
    n = handle_db(update_account_pointsql)
    if n['errorNo'] == 0:
        # NOTE(review): this reports self.reply_num as the current
        # integral; point_num looks like the intended value. Confirm.
        log.msg(('<username: %s \'s integral is : -1 ,now integral is %s>' % (self.username, self.reply_num)), log.INFO)
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job is successed', log.INFO)
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns
        # None, so the old code only ever logged the string "None".
        log.msg(traceback.format_exc(), log.ERROR)
        log.msg('copy job is failture', log.ERROR)
    request_topic = self.repeat_reply(response)
    return request_topic
def get_downloadpath(self, response):
    """Follow the first gfan attachment link found on the page.

    Returns a copy of the current request that points at the first
    ``attachment.php`` link in ``response.body``, uses ``self.get_apk`` as
    the callback and carries the text captured by the file-name pattern
    under ``meta['filename']``. If either lookup fails, the traceback is
    printed, the topic is passed through ``sql.topicurl_withcrawed`` /
    ``handle_db`` and ``None`` is returned.
    """
    meta = response.request.meta
    topic_url = meta['topic_url']
    log.msg(('get_downloadpath,topic_url', meta['topic_url']), log.INFO)
    try:
        # The page that precedes the real download address is response.url.
        attachment_links = re.findall('<a href="(attachment.php?[^"]+)".*?>', response.body, re.I)
        download_url = 'http://bbs.gfan.com/%s' % attachment_links[0]
        # File name (provisional, per the original comment).
        name_matches = re.findall('“.*\.(.*?)”', response.body, re.I)
        file_name = name_matches[0]
        next_request = response.request.replace(url=download_url, callback=self.get_apk)
        next_request.meta['filename'] = file_name
        return next_request
    except:
        traceback.print_exc()
        # Reaching this point presumably means the file name could not be
        # scraped; the topic is updated either way.
        status = handle_db(sql.topicurl_withcrawed(topic_url))
        print(status)
def get_apk(self, response):
    """Write the downloaded package to disk and record it in the database.

    The file goes to ``<cwd>/gfan/download/<YYYY-MM-DD>/`` and is named
    with the current Unix timestamp followed by ``meta['filename']``.
    After it is written, a software-info row is inserted and the topic is
    updated through ``sql.topicurl_withcrawed``.
    """
    filename = response.request.meta['filename']
    # Build the dated directory once. It used to be rebuilt three times
    # with separate strftime() calls, which could disagree across midnight.
    day_dir = os.sep.join(
        [os.getcwd(), 'gfan', 'download', time.strftime('%Y-%m-%d')])
    if not os.path.exists(day_dir):
        # makedirs also creates missing parents; os.mkdir failed when
        # 'gfan/download' did not exist yet.
        os.makedirs(day_dir)
    filename = ''.join([day_dir, os.sep, str(time.time()).split('.')[0], filename])
    # Close the handle deterministically; the old
    # ``open(...).write(...)`` form left that to the garbage collector.
    with open(filename, 'wb') as apk_file:
        apk_file.write(response.body)
    # Store the database row only after the download is on disk.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    # Double the backslashes before the path is embedded in the SQL text.
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    print(status)
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = handle_db(update_topic_url)
    print(status)
def get_code(self, response):
    """Build the reply form request for a topic page.

    Returns ``None`` when ``self.topic_reply_num`` is below 1. If the page
    carries the "topic removed" notice, the topic is passed through
    ``sql.topicurl_withcrawed`` / ``handle_db`` and the result of
    ``self.repeat_reply(response)`` is returned. Otherwise a one-element
    list is returned, holding the ``FormRequest`` that posts the reply
    (callback ``self.reply_status``). If building that request fails, the
    list holds the result of ``self.reply_status(response)`` instead.
    """
    if self.topic_reply_num < 1:
        return
    # NOTE(review): sleeps 20 seconds inside the callback on every call;
    # kept as is.
    time.sleep(20)
    # The site administrator has removed the topic page.
    if re.findall('指定的主题不存在或已被删除或正在被审核'.decode('utf-8', 'ignore').encode('utf-8', 'ignore'), response.body, re.I):
        # ``topic_url`` was never defined in this function, so this branch
        # raised a NameError; read it from the request meta.
        topic_url = response.request.meta['topic_url']
        log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        # Hand over to repeat_reply.
        request_topic = self.repeat_reply(response)
        return request_topic
    message = self.get_othermessage(response)
    if len(message) < 10:
        # Short messages are stripped of tags and padded with dots.
        message = ''.join([re.sub('<.*?>', '', message), '..........'])
    try:
        posttime = str(time.time()).split('.')[0]
        # Renamed from ``hash`` so the builtin is not shadowed.
        form_hash = getFormHash(response.body)
        formdata = {"message": message, 'posttime': posttime, 'formhash': form_hash, 'subject': ''}
        formrequest = FormRequest.from_response(
            response=response, formnumber=1, formdata=formdata, dont_click=True)
        formrequest.callback = self.reply_status
        print('response.url----> %s' % (response.url,))
        formrequest.meta['topic_url'] = response.request.meta['topic_url']
        formrequest.meta['topic_response'] = response.request.meta['topic_response']
        return [formrequest]
    except Exception:
        # The reply could not be built (for example the page has no form
        # and from_response raised IndexError): fall back to reply_status.
        request_topic = self.reply_status(response)
        return [request_topic]
def get_code(self, response):
    """Return the request for the security-question snippet.

    Returns ``None`` when ``self.topic_reply_num`` is below 1. After a
    20-second sleep the page is checked for the "topic removed" notice: if
    present, the topic is passed through ``sql.topicurl_withcrawed`` /
    ``handle_db`` and the result of ``self.repeat_reply(response)`` is
    returned. Otherwise a copy of the current request aimed at the gfan
    ``newupdatesecqaa`` ajax URL is returned, with ``self.ay_code`` as the
    callback and the topic-page response stored in its meta.
    """
    if self.topic_reply_num < 1:
        return
    time.sleep(20)
    # Has the site administrator removed the topic page?
    removed = re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I)
    if not removed:
        log.msg(('response.url', response.url), log.INFO)
        code_request = response.request.replace(
            url='http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',
            callback=self.ay_code)
        code_request.meta['response'] = response
        return code_request
    # Run the topic update, then hand over to repeat_reply.
    topic_url = response.request.meta['topic_url']
    log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
    status = handle_db(sql.topicurl_withcrawed(topic_url))
    return self.repeat_reply(response)
def pass_topic(self, response):
    """Look for download links on a topic page.

    If the page matches ``settings.TRAIT`` (re-encoded to
    ``settings.CODE``), a ``SrcItem`` holding the topic URL, the referer
    and the spider name is yielded. Otherwise every distinct match of
    ``settings.DOWN_REG`` is prefixed with ``settings.DOMAIN`` and yielded
    as a GET request with ``self.get_apk`` as the callback. When there are
    no matches a bare ``None`` is yielded.
    """
    trait = ('%s' % settings.TRAIT).decode('utf-8', 'ignore').encode('%s' % settings.CODE)
    noauth = re.findall(trait, response.body, re.I)
    referer_url = response.request.headers.get('Referer', None)
    if noauth:
        i = SrcItem()
        # Per the original comment: first visit to this page.
        topic_url = response.url
        i['topic_url'] = topic_url
        i['referer_url'] = referer_url
        i['spider_name'] = settings.SPIDERNAME
        yield i
    else:
        try:
            urls = re.findall(settings.DOWN_REG, response.body, re.I)
            if not urls:
                log.msg(('this url->%s has not apk file' % response.url), log.INFO)
                yield
            else:
                for url in set(urls):
                    url = '%s%s' % (settings.DOMAIN, url)
                    print('download_url %s' % (url,))
                    # Turn the HTML-escaped '&amp;' back into '&'.
                    url = url.replace('amp;', '')
                    request = response.request.replace(
                        url=url, method='get', callback=self.get_apk)
                    request.meta['referer_url'] = referer_url
                    yield request
        except IndexError:
            # format_exc() returns the traceback text; print_exc() returns
            # None, so the old code only ever logged the string "None".
            log.msg(traceback.format_exc(), log.ERROR)
            # No apk package on this page: update the crawl state of the link.
            update_crawled_sql = sql.topicurl_withcrawed(response.url)
            # NOTE(review): sibling spiders call ``handle_db``; confirm
            # that ``handler_db`` exists in this module.
            status = handler_db(update_crawled_sql)
            if status['errorNo'] == 1:
                log.msg('this url has no apk', log.INFO)
            yield