def get_apk(self, response):
    topic_url = referer_url = response.request.meta['referer_url']
    update_date = time.strftime('%Y-%m-%d %H:%M:%S')  # fixed format string: was '%Y-m-%d'
    is_crawled = '1'
    priority_rating = '0'
    filename = ''.join([str(random.randrange(1, 100000)), '.apk'])
    # Create the download directory (this helper replaced the old inline os.path.exists/os.mkdir calls).
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # Some filenames may be UTF-8 encoded; decode them to unicode.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    open(filename, 'wb').write(response.body)
    # Write to the database only after the download succeeds, to keep records accurate.
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # Update the topic table.
    insert_sql = sql.insert_topic_url(self.platform, topic_url, referer_url, updatedate, is_crawled, priority_rating)
    status = handle_db(insert_sql)  # fixed: was assigned to `statusn` but logged as `status`
    log.msg(str(status), log.DEBUG)
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)  # print_exc() returns None; format_exc() gives the text
        log.msg('copy job failed', log.ERROR)
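# Several handlers above and below delegate directory creation to utils.make_spiderdir,
# which replaced the commented-out inline os.path.exists/os.mkdir calls. The helper is
# not shown in this section; the following is a minimal sketch of what the call sites
# imply, assuming the dated subfolder carries over from the old inline code.
import os
import time

def make_spiderdir(platform, subdir):
    # Hypothetical sketch: build <cwd>/<platform>/<subdir>/<YYYY-MM-DD>/, creating any
    # missing levels, and return the path for the caller to join filenames onto.
    path = os.path.join(os.getcwd(), platform, subdir, time.strftime('%Y-%m-%d'))
    if not os.path.exists(path):
        os.makedirs(path)
    return path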
def reply_status(self, response):
    # Return to the topic page afterwards. This matters for boards that require a reply
    # before downloading; for posts that need no reply it is a performance cost.
    # Check whether the reply was accepted.
    success = u'非常感谢,你的回复已经发布'.encode('gbk', 'ignore')
    status = re.findall(success, response.body)
    username_sql = sql.select_accountbyusername(self.username, self.platform)
    # The board throttles posts per user group per hour.
    failure = u'对不起,您所在的用户组每小时限制发帖 '.encode('gbk', 'ignore')
    failure_status = re.findall(failure, response.body, re.I)
    if failure_status:
        return
    if status:
        log.msg('reply success', log.INFO)
        reply_nums = handle_db(username_sql)['data'][0]
        self.point_num = reply_nums[5]
        self.reply_num = reply_nums[-2]
        self.reply_num += 1
        self.point_num += 1
        # Reply succeeded: increment the account's reply_num and points by 1.
        try:
            update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
            update_success = handle_db(update_replynum)
            log.msg(('<username: %s\'s integral is: +1, now integral is %s>' % (self.username, self.point_num)), log.INFO)
        except Exception, e:
            log.msg(str(traceback.format_exc()), log.ERROR)
def get_apk(self, response):
    '''Write the package fetched from the download URL to a file and record it.'''
    filename = response.request.meta['filename']
    filename = ''.join([str(random.randrange(1, 100000)), '.', filename])
    # Create the download directory (this helper replaced the old inline os.path.exists/os.mkdir calls).
    down_dir = utils.make_spiderdir(self.name, 'download')
    # Some filenames may be UTF-8 encoded; decode them to unicode.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    open(filename, 'wb').write(response.body)
    # Write to the database only after the download succeeds, to keep records accurate.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.name, hashurl, topic_url, updatedate, filename)
    status = db.handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = db.handle_db(update_topic_url)
    log.msg(str(status), log.DEBUG)
    # Copy to the backup directory.
    try:
        autocopy.copy(filename, self.name)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg('copy job failed', log.ERROR)
def parse_item(self, response):
    # Parse the packages that need downloading out of the topic page.
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']
        if re.findall(u'指定的主题不存在或已被删除或正在被审核,请返回'.encode('gbk', 'ignore'), response.body, re.I):
            # The admin removed the topic: mark it crawled and stop.
            log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except:
        log.msg(str(traceback.format_exc()), log.INFO)
    try:
        urls = re.findall('<a href="(.*?)".*\.apk</a>', response.body, re.I)
        if urls == []:
            # Hidden content that must be replied to before it can be downloaded.
            reply_urls = re.findall(u'如果你要查看本帖隐藏内容请'.encode('gbk', 'ignore'), response.body, re.I)
            log.msg('reply_urls: %s' % str(reply_urls), log.DEBUG)
            if reply_urls != []:
                # Set the crawl priority to -1 so the separate reply-and-download
                # module, which looks for priority -1, picks this topic up.
                update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(str(n), log.INFO)
                return
            log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is useless; mark is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            # The reply succeeded but there is no download link; keep this login
            # session going rather than aborting.
            self.topic_reply_num -= 1
            log.msg(('reply success, will download software %s' % str(self.topic_reply_num)), log.DEBUG)
            log.msg(str(response), log.DEBUG)
            request = self.get_down_topic(response, topic_url)  # fixed: `url` was undefined here; topic_url is the intended value
            yield request
        else:
            for url in set(urls):
                url = url.replace('amp;', '')
                log.msg('url: %s' % url, log.DEBUG)
                request = response.request.replace(url=url, callback=self.get_downloadpath)
                request.meta['url'] = response.url
                yield request
    except IndexError, e:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # No apk attachment found: mark the link as crawled.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)  # fixed typo: was handler_db
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def parse_item(self, response):
    # Parse the packages that need downloading out of the topic page.
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']  # moved up: the removed-topic branch below needs it
        if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
            # The admin removed the topic: mark it crawled and stop.
            log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except:
        log.msg(str(traceback.format_exc()), log.INFO)
    try:
        urls = re.findall('<a href="(forum\.php\?[^"]+)" target="_blank">.*?\.apk</a>', response.body, re.I)  # escaped the literal `forum.php?`
        if urls == []:
            # Hidden content cannot be downloaded without replying first.
            noreply_regex = u'如果您要查看本帖隐藏内容请'.encode('gbk', 'ignore')
            noreply = re.findall(noreply_regex, response.body, re.I)
            for i in noreply:
                log.msg(i.decode('gbk', 'ignore'), log.DEBUG)
            if noreply != []:
                # Set the topic's priority (avail field) to -1 so the reply module handles it.
                update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                n = handle_db(update_topic_priority_sql)
                log.msg(''.join(['hide_apk_update topic_url priority=-1 ', str(n)]), log.INFO)
                return
            else:
                log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
                # Without an apk the topic link is useless; mark is_crawled=1.
                update_topic_url = sql.topicurl_withcrawed(topic_url)
                status = handle_db(update_topic_url)
                return
        else:
            for url in set(urls):
                url = 'http://bbs.mumayi.com/%s' % url
                request = response.request.replace(url=url, callback=self.get_apk)
                request.meta['url'] = response.url
                yield request
    except IndexError, e:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # No apk attachment found: mark the link as crawled.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)  # fixed typo: was handler_db
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def get_proxy(spider_name, enable):
    try:
        proxy_sql = sql.get_proxy_url(spider_name, enable)
        n = handle_db(proxy_sql)['data'][0][0]
    except:
        # Every proxy has been consumed: indexing the empty result raises, so reset
        # the enable flags for all proxies and call this method again (recursion).
        # Persistent errors will surface on the next failure.
        update_enable_sql = sql.update_proxy_enable(spider_name)
        handle_db(update_enable_sql)
        return get_proxy(spider_name, enable)  # fixed: the recursive result was being discarded
    return n
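# An alternative sketch of get_proxy that bounds the retries with a loop instead of
# unbounded recursion. It assumes the same sql.get_proxy_url / sql.update_proxy_enable
# helpers and that handle_db returns a dict with a 'data' list, as the code above implies;
# the max_resets parameter is an addition for illustration.
def get_proxy_iterative(spider_name, enable, max_resets=3):
    for _ in range(max_resets + 1):
        result = handle_db(sql.get_proxy_url(spider_name, enable))
        if result.get('data'):
            return result['data'][0][0]
        # Pool exhausted: mark every proxy usable again and try once more.
        handle_db(sql.update_proxy_enable(spider_name))
    return None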
def get_integral_page(self, response):
    # Pull the user's points (gold coins) off the page with a regex.
    try:
        integral = re.findall(u'金币: <a href=".*?">(.*?)</a>'.encode('gbk', 'ignore'), response.body, re.I)[0].replace(' ', '')
        log.msg('integral: %s' % integral, log.DEBUG)
        if integral:
            # Persist the points, then decide: below the threshold we stop here,
            # at or above it we continue on to the topic page.
            update_user_integral_sql = sql.update_account_point(self.username, self.platform, integral)
            n = handle_db(update_user_integral_sql)
            log.msg(('update user(%s)\'s integral is: %s, %s' % (self.username, integral, n)), log.INFO)
            # The minimum points required to download is configurable.
            if int(integral) > settings.INTEGERAL:
                request = self.get_topic(response)
                return request
            else:
                return
        else:
            log.msg('cannot get user\'s integral', log.ERROR)
            request = self.get_topic(response)
            return request
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        request = self.get_topic(response)
        return request
def parse_item(self, response):
    # Parse the packages that need downloading out of the topic page.
    topic_url = response.request.meta['topic_url']
    try:
        # Fixed regex: `[apk|zip|rar]` was a character class, not an alternation.
        urls = re.findall('<a.*?href="(.*?)".*?>.*\.(?:apk|zip|rar).*?</a>', response.body, re.I)
        print urls
        if urls == []:
            print 'this url->%s has no apk file' % response.request.meta['topic_url']
            # Without an apk the topic link is useless; mark is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            print status
        for url in set(urls):
            url = 'http://bbs.gfan.com/%s' % url
            request = response.request.replace(url=url, callback=self.get_attachementpath)
            request.meta['url'] = response.url
            yield request
        return
    except IndexError, e:
        traceback.print_exc()
        # No apk attachment found: mark the link as crawled.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)  # fixed typo: was handler_db
        if status['errorNo'] == 1:
            print 'this url has no apk'
        return
def get_apk(self, response):
    filename = ''.join([str(random.randrange(1, 100000)), '.apk'])
    # Create the download directory (this helper replaced the old inline os.path.exists/os.mkdir calls).
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # Some filenames may be UTF-8 encoded; decode them to unicode.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    open(filename, 'wb').write(response.body)
    # Write to the database only after the download succeeds, to keep records accurate.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = handle_db(update_topic_url)
    log.msg(str(status), log.DEBUG)
    # Reaching this method means the file was downloaded, which costs 1 point.
    account_sql = sql.select_accountbyusername(self.username, self.platform)
    point_num = handle_db(account_sql)['data'][0][5]
    point_num -= 1
    update_account_pointsql = sql.update_account_point(self.username, self.platform, point_num)
    n = handle_db(update_account_pointsql)
    if n['errorNo'] == 0:
        log.msg(('<username: %s\'s integral is: -1, now integral is %s>' % (self.username, point_num)), log.INFO)  # fixed: logged self.reply_num instead of the points
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        log.msg('copy job failed', log.ERROR)
    request_topic = self.repeat_reply(response)
    return request_topic
def reply_status(self, response):
    # Return to the topic page afterwards. This matters for boards that require a reply
    # before downloading; for posts that need no reply it is a performance cost.
    # Check whether the reply was accepted.
    success = u'非常感谢,你的回复已经发布'.encode('gbk', 'ignore')
    status = re.findall(success, response.body)
    username_sql = sql.select_accountbyusername(self.username, self.platform)
    log.msg('status: %s' % str(status), log.DEBUG)
    # The board throttles posts per user group per hour.
    failure = u'对不起,您所在的用户组每小时限制发帖 '.encode('gbk', 'ignore')
    failure_status = re.findall(failure, response.body, re.I)
    if failure_status:
        log.msg('reply rejected: hourly post limit for this user group', log.INFO)
        return
    if status:
        log.msg('reply success', log.INFO)
        reply_nums = handle_db(username_sql)['data'][0]
        self.point_num = reply_nums[5]
        self.reply_num = reply_nums[-2]
        self.reply_num += 1
        self.point_num += 1
        # Reply succeeded: increment the account's reply_num and points by 1.
        try:
            update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
            update_success = handle_db(update_replynum)
            log.msg(('<username: %s\'s integral is: +1, now integral is %s>' % (self.username, self.point_num)), log.INFO)
            # Reply succeeded, so re-enter the topic page and download.
            url = response.request.meta['topic_url']
            log.msg('reply succeeded for topic url----> %s' % url, log.INFO)
            self.topic_reply_num -= 1
            log.msg(('reply success, will download software %s' % str(self.topic_reply_num)), log.DEBUG)
            log.msg(str(response), log.DEBUG)
            request = self.get_down_topic(response, url)
            return request
        except Exception, e:
            log.msg(str(traceback.format_exc()), log.ERROR)
    # Fall-through: the reply was not confirmed or the update failed. Retry via
    # repeat_reply; topic_reply_num decreases each time, and once it drops below 1
    # nothing more is done.
    self.topic_reply_num -= 1
    log.msg(('will retry reply, remaining %s' % str(self.topic_reply_num)), log.DEBUG)
    log.msg(str(response), log.DEBUG)
    request_topic = self.repeat_reply(response)
    return request_topic
def get_apk(self, response):
    filename = response.request.meta['filename']
    # Build gfan/download/<date>/ under the working directory if it does not exist yet.
    date_dir = ''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'), os.sep])
    if os.path.exists(date_dir) is False:
        os.mkdir(date_dir)
    filename = ''.join([date_dir, str(time.time()).split('.')[0], filename])
    open(filename, 'wb').write(response.body)
    # Write to the database only after the download succeeds, to keep records accurate.
    topic_url = response.request.meta['topic_url']
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    print status
    # Update the topic table.
    update_topic_url = sql.topicurl_withcrawed(topic_url)
    status = handle_db(update_topic_url)
    print status
def __init__(self):
    account_message = get_account(self.platform)
    self.username = account_message[2]
    self.password = account_message[3]
    self.reply_num = 0
    # Before the spider starts, mark the account as in use (flag = 1) so no other
    # thread picks up the same account.
    update_avail_sql = sql.update_use_byusernamesql(self.username, self.platform, '1')
    n = handle_db(update_avail_sql)
    log.msg(('<username: %s> is being used' % self.username), log.DEBUG)
    # When this object receives the spider-closed signal, call self.user_relax.
    dispatcher.connect(self.user_relax, signals.spider_closed)
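# user_relax is wired to spider_closed above but its body is not part of this section.
# A minimal sketch of what it presumably does, releasing the account by resetting the
# in-use flag that __init__ set to '1'; the same sql/handle_db helpers are assumed.
def user_relax(self):
    # Hypothetical sketch: mark the account as free again so other threads may use it.
    update_avail_sql = sql.update_use_byusernamesql(self.username, self.platform, '0')
    handle_db(update_avail_sql)
    log.msg(('<username: %s> is released' % self.username), log.DEBUG)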
def process_item(self, item, spider):
    # Unpack the item passed in from the spider and persist the topic URL.
    topic_url = item['topic_url']
    referer_url = item['referer_url']
    spider_name = item['spider_name']
    update_date = time.strftime('%Y-%m-%d %H:%M:%S')
    is_crawled = 0
    priority_rating = 0
    sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
    n = handle_db(sql)
    print n
    return item
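# insert_topic_url is imported from the project's sql module and not shown here. A
# minimal sketch of the kind of statement builder the call sites imply; the table and
# column names are assumptions, and the duplicate-key behavior (errorNo == -1 elsewhere
# in this code) would come from a unique index on the hashed URL.
def insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating):
    # Hypothetical sketch only: real code should escape or parameterize these values
    # in the db layer rather than interpolating them into the SQL string.
    return ("INSERT INTO topicdata "
            "(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating) "
            "VALUES ('%s', '%s', '%s', '%s', '%s', '%s')"
            % (spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating))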
def parse_item(self, response):
    # Parse the packages that need downloading out of the topic page.
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']  # moved up: the removed-topic branch below needs it
        if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
            # The admin removed the topic: mark it crawled and stop.
            log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except:
        log.msg(str(traceback.format_exc()), log.INFO)
    try:
        urls = re.findall('<a href="(forum\.php\?[^"]+)".*>.*?\.apk', response.body, re.I)  # escaped the literal `forum.php?`
        if urls == []:
            # Hidden content cannot be downloaded without replying first.
            log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is useless; mark is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            request_topic = self.repeat_reply(response)
            yield request_topic
        else:
            for url in set(urls):
                url = 'http://bbs.mumayi.com/%s' % url
                url = url.replace('amp;', '')
                request = response.request.replace(url=url, method='GET', callback=self.get_apk)
                request.meta['url'] = response.url
                yield request
    except IndexError, e:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # No apk attachment found: mark the link as crawled.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)  # fixed typo: was handler_db
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def get_topic(self, response):
    # Pick one not-yet-downloaded topic (ordered by time) to reply to.
    topic_urls = sql.get_topic_ranone(self.platform, '0')
    url = handle_db(topic_urls)['data'][0]
    request = response.request.replace(url=url[0], method='GET')
    request.callback = self.get_code
    request.meta['topic_url'] = url[0]
    # Stash the post-login response so later callbacks can re-enter this method
    # with the same object.
    request.meta['topic_response'] = response
    return request
def reply_status(self, response):
    # Return to the topic page afterwards. This matters for boards that require a reply
    # before downloading; for posts that need no reply it is a performance cost.
    # Check whether the reply was accepted.
    success = '非常感谢,您的回复已经发布'
    status = re.findall(success, response.body)
    username_sql = sql.select_accountbyusername(self.username, self.platform)
    if status:
        log.msg('reply success', log.INFO)
        reply_nums = handle_db(username_sql)['data'][0]
        self.point_num = reply_nums[5]
        self.reply_num = reply_nums[-2]
        self.reply_num += 1
        self.point_num += 1
        # Reply succeeded: increment the account's reply_num and points by 1.
        try:
            update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
            update_success = handle_db(update_replynum)
            log.msg(('<username: %s\'s integral is: +1, now integral is %s>' % (self.username, self.point_num)), log.INFO)
        except Exception, e:
            log.msg(str(traceback.format_exc()), log.ERROR)
def parse_item(self, response):
    # Parse the packages that need downloading out of the topic page.
    log.msg(response.url, log.INFO)
    try:
        topic_url = response.request.meta['topic_url']
        if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
            # The admin removed the topic: mark it crawled and stop.
            log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
    except:
        log.msg(str(traceback.format_exc()), log.INFO)
    try:
        urls = re.findall('<a href="(attachment\.php\?[^"]+)".*?>.*\.apk', response.body, re.I)  # escaped the literal `attachment.php?`
        if urls == []:
            log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
            # Without an apk the topic link is useless; mark is_crawled=1.
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            return
        else:
            for url in set(urls):
                url = 'http://bbs.gfan.com/%s' % url
                request = response.request.replace(url=url, callback=self.get_attachementpath)
                request.meta['url'] = response.url
                yield request
    except IndexError, e:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # No apk attachment found: mark the link as crawled.
        update_crawled_sql = sql.topicurl_withcrawed(response.url)
        status = handle_db(update_crawled_sql)  # fixed typo: was handler_db
        if status['errorNo'] == 1:
            log.msg('this url has no apk', log.INFO)
        return
def get_apk(self, response):
    filename = ''.join([str(random.randrange(1, 100000)), '.ipa'])
    # Create the download directory (this helper replaced the old inline os.path.exists/os.mkdir calls).
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # Some filenames may be UTF-8 encoded; decode them to unicode.
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    # Write to the database only after the download succeeds, to keep records accurate.
    topic_url = response.request.url
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    # Insert the topic page first; a duplicate-key result (errorNo == -1) means this
    # file was already downloaded, so bail out before writing anything to disk.
    insert_topic_sql = sql.insert_topic_url(self.name, topic_url, '', updatedate, '1', '0')
    topic_status = handle_db(insert_topic_sql)
    if topic_status['errorNo'] == -1:
        raise ValueError('this ipa file has already been downloaded into the database')
    open(filename, 'wb').write(response.body)
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        log.msg('copy job failed', log.ERROR)
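# sql.hash_topic is used throughout as the dedup key for topic URLs but is not shown
# in this section. A plausible sketch, assuming a stable digest of the URL; the choice
# of MD5 is an assumption, and any fixed-width digest works as long as a unique index
# on the column enforces one row per URL.
import hashlib

def hash_topic(topic_url):
    # Hypothetical sketch: derive a fixed-width dedup key from the topic URL.
    return hashlib.md5(topic_url).hexdigest()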
def reply_status(self, response):
    # Return to the topic page afterwards. This matters for boards that require a reply
    # before downloading; for posts that need no reply it is a performance cost.
    # Check whether the reply was accepted.
    success = '非常感谢,您的回复已经发布'
    status = re.findall(success, response.body)
    print self.username
    username_sql = sql.select_accountbyusername(self.username, self.platform)
    if status:
        print 'reply success'
        reply_nums = handle_db(username_sql)['data'][0]
        self.reply_num = reply_nums[-2]
        self.reply_num += 1
        # Reply succeeded: increment the account's reply_num by 1.
        try:
            now_time = time.strftime('%Y-%m-%d %H:%M:%S')
            update_replynum = sql.update_account_username(self.username, now_time, self.platform, self.reply_num)
            update_success = handle_db(update_replynum)
            print update_success
            print 'update success! user\'s reply_num + 1'
        except Exception, e:
            traceback.print_exc()
def get_topic(self, response):
    # Pick random not-yet-crawled topics and reply to them for download.
    topic_urls = sql.get_topic_bycrawed(self.platform, '0')
    urls = handle_db(topic_urls)['data']
    for url in urls:
        print 'url---------->', url
        # NOTE: the proxy is hard-coded here; self.proxy_url (commented out below)
        # is presumably the configurable alternative.
        request = response.request.replace(url=url[0], method='GET', meta={'proxy': 'http://119.115.136.226:443'})
        request.callback = self.get_code
        request.meta['crawled'] = url[1]
        request.meta['topic_url'] = url[0]
        #request.meta['proxy'] = self.proxy_url
        yield request
def get_downloadpath(self, response):
    topic_url = response.request.meta['topic_url']
    log.msg(('get_downloadpath, topic_url: %s' % topic_url), log.INFO)
    try:
        # The page preceding the real download URL is response.url; pull the
        # attachment link out of it.
        url = re.findall(u'<a href="(http://bbs\.goapk\.com/forum\.php\?mod=attachment.*aid=.*)">', response.body, re.I)[0]
        url = url.replace('amp;', '')
        request = response.request.replace(url=url, callback=self.get_apk)
        return request
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # An exception here means the attachment link could not be extracted;
        # mark the topic as crawled so it is not retried.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)
def process_item(self, item, spider):
    # Unpack the item passed in from the spider and persist the topic URL.
    log.msg(str(item), log.DEBUG)
    n = None  # fixed: `n` was referenced in the log below even when the try block failed early
    try:
        topic_url = item['topic_url']
        referer_url = item['referer_url']
        spider_name = item['spider_name']
        update_date = time.strftime('%Y-%m-%d %H:%M:%S')
        is_crawled = 0
        priority_rating = 5
        sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
        n = handle_db(sql)
    except:
        log.msg(str(traceback.format_exc()), log.DEBUG)
    log.msg('insert a message ' + str(n), log.DEBUG)
    return item
def __init__(self):
    # Fetch an account for this platform.
    account_message = get_account(self.platform)
    self.username = account_message[2]
    self.password = account_message[3]
    self.reply_num = 0
    # Before the spider starts, mark the account as in use so no other thread
    # picks up the same account.
    update_avail_sql = sql.update_use_byusernamesql(self.username, self.platform, '1')
    n = handle_db(update_avail_sql)
    log.msg(('<username: %s> is being used' % self.username), log.DEBUG)
    # Number of topics to reply to, taken from settings.py.
    self.topic_reply_num = settings.REPLY_NUM
    # When this object receives the spider-closed signal, call self.user_relax.
    dispatcher.connect(self.user_relax, signals.spider_closed)
def get_code(self, response):
    # The site admin removed the topic page content.
    if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
        # Mark the topic as crawled and stop.
        topic_url = response.request.meta['topic_url']  # fixed: was referenced without being read from meta
        print 'this topic_url(%s) has been removed by admin' % response.url
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        print status
        return
    print 'response.url', response.url
    # Fetch the security question/answer fragment used by the reply form.
    request_code = response.request.replace(
        url='http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',
        callback=self.ay_code)
    request_code.meta['response'] = response
    #request_code.meta['proxy'] = self.proxy_url
    response.request.meta['response'] = response
    yield request_code
def get_downloadpath(self, response):
    topic_url = response.request.meta['topic_url']  # fixed: the except branch below referenced an undefined name
    print 'get_downloadpath,topic_url', topic_url
    try:
        # The page preceding the real download URL is response.url; pull the
        # confirm link and the display filename out of it.
        url = re.findall('<p class="alert_btnleft"><a href="(.*?)">', response.body, re.I)[0]
        url = 'http://bbs.gfan.com/%s' % url
        file_name = re.findall('<div class="alert_info">\s+<p>.*?“(.*?)”', response.body, re.I)[0]
        request = response.request.replace(url=url, callback=self.get_apk)
        request.meta['filename'] = file_name
        return request
    except:
        traceback.print_exc()
        # An exception here means the filename/link could not be extracted;
        # mark the topic as crawled so it is not retried.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        print status
def get_downloadpath(self, response):
    topic_url = response.request.meta['topic_url']
    log.msg(('get_downloadpath, topic_url: %s' % topic_url), log.INFO)
    try:
        # The page preceding the real download URL is response.url; pull the
        # attachment link and the file extension out of it.
        url = re.findall('<a href="(attachment\.php\?[^"]+)".*?>', response.body, re.I)[0]  # escaped the literal `attachment.php?`
        url = 'http://bbs.gfan.com/%s' % url
        file_name = re.findall('“.*\.(.*?)”', response.body, re.I)[0]
        request = response.request.replace(url=url, callback=self.get_apk)
        request.meta['filename'] = file_name
        return request
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        # An exception here means the filename/link could not be extracted;
        # mark the topic as crawled so it is not retried.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)
def get_code(self, response):
    if self.topic_reply_num < 1:
        return
    time.sleep(20)
    # The site admin removed the topic page content.
    if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):  # dropped a no-op decode/encode round trip
        # Mark the topic as crawled and retry with another topic.
        topic_url = response.request.meta['topic_url']  # fixed: was referenced without being read from meta
        log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        request_topic = self.repeat_reply(response)
        return request_topic
    message = self.get_othermessage(response)
    if len(message) < 10:
        # Pad short replies so they pass the board's minimum-length check.
        message = ''.join([re.sub('<.*?>', '', message), '..........'])
    try:
        posttime = str(time.time()).split('.')[0]
        hash = getFormHash(response.body)
        formdata = {'message': message, 'posttime': posttime, 'formhash': hash, 'subject': ''}
        formrequest = FormRequest.from_response(response=response, formnumber=1, formdata=formdata, dont_click=True)
        formrequest.callback = self.reply_status
        log.msg('response.url----> %s' % response.url, log.DEBUG)
        formrequest.meta['topic_url'] = response.request.meta['topic_url']
        formrequest.meta['topic_response'] = response.request.meta['topic_response']
        #formrequest.meta['proxy'] = self.proxy_url
        return [formrequest]
    except:
        # If the reply cannot be posted (e.g. no form on the page raises IndexError),
        # fall through to reply_status so the retry counters still advance.
        request_topic = self.reply_status(response)
        return [request_topic]
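# getFormHash is called above but not shown in this section. Discuz!-style boards such
# as these embed an anti-CSRF token in a hidden "formhash" input, so a plausible sketch
# looks like the following; the exact regex is an assumption.
def getFormHash(body):
    # Hypothetical sketch: grab the first hidden formhash value, or '' if absent.
    match = re.search('name="formhash"\s+value="([^"]+)"', body, re.I)
    return match.group(1) if match else ''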
def get_integral_page(self, response):
    # Pull the user's points (gold coins) off the page with a regex.
    try:
        integral = re.findall(u'<li><em>金币</em>(.*?)</li>'.encode('gbk', 'ignore'), response.body, re.I)[0].replace(' ', '')
        if integral:
            # Persist the points, then continue to the topic page. The reply and
            # download modules now run in parallel, so the old minimum-points gate
            # is gone: it stopped the account from replying at all.
            update_user_integral_sql = sql.update_account_point(self.username, self.platform, integral)
            n = handle_db(update_user_integral_sql)
            log.msg(('update user(%s)\'s integral is: %s, %s' % (self.username, integral, n)), log.INFO)
            request = self.get_topic(response)
            return request
        else:
            log.msg('cannot get user\'s integral', log.ERROR)
            request = self.get_topic(response)
            return request
    except:
        log.msg(str(traceback.format_exc()), log.ERROR)
        request = self.get_topic(response)
        return request
def get_code(self, response):
    if self.topic_reply_num < 1:
        return
    time.sleep(20)
    # The site admin removed the topic page content.
    if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
        # Mark the topic as crawled and retry with another topic.
        topic_url = response.request.meta['topic_url']
        log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        request_topic = self.repeat_reply(response)
        return request_topic
    log.msg(('response.url: %s' % response.url), log.INFO)
    # Fetch the security question/answer fragment used by the reply form.
    request_code = response.request.replace(
        url='http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',
        callback=self.ay_code)
    request_code.meta['response'] = response
    #request_code.meta['proxy'] = self.proxy_url
    return request_code
def get_downloadurl(self, response):
    '''
    Run the apk regex over the topic page content. Following the raw download
    links directly is not controllable and would pull in many useless packages.
    '''
    try:
        down_urls = re.findall('<a href="(.*?)">.*\.(apk)</a>', response.body)
        # A page with attachments is inserted into the topicdata table; a successful
        # insert means it has not been crawled before, and it is updated again after
        # the real download.
        if any(down_urls):
            try:
                update_time = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_topic_sql = sql.insert_topic_url(self.name, response.url, response.request.url, update_time, '0', '0')
                log.msg(insert_topic_sql, log.INFO)
                status = db.handle_db(insert_topic_sql)
                if status['errorNo'] == 0:
                    log.msg('this url can be used', log.INFO)
                else:
                    log.msg('this url cannot be used', log.INFO)
                    # This return is what keeps already-crawled topic pages from being
                    # crawled again, so no package is fetched twice.
                    return
            except:
                # Most likely a unique-index violation: the row already exists.
                log.msg(str(traceback.format_exc()), log.INFO)
                return
            for url in down_urls:
                filename = url[1]
                url = '%s%s' % ('http://forum.xda-developers.com/', url[0])
                # The URL is already recorded in topicdata; meta carries the topic_url
                # so get_apk can update it after the download. Passing meta= to the
                # constructor replaces the old assignment to the private request._meta.
                request = Request(url=url, callback=self.get_apk,
                                  meta={'filename': filename, 'topic_url': response.url})
                yield request
    except:
        log.msg(str(traceback.format_exc()), log.INFO)
        return