Example #1
    def get_apk(self, response):
        topic_url = referer_url = response.request.meta['referer_url']
        update_date = time.strftime('%Y-%m-%d %H:%M:%S')
        is_crawled = '1'
        priority_rating = '0'
        filename = ''.join([str(random.randrange(1,100000)), '.apk'])
        # create the download directory via the shared helper
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # decode: some filenames may be utf-8 encoded; decode to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except:
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        with open(filename, 'wb') as f:
            f.write(response.body)
        # persist to the database only after the download completes, for accuracy
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 
        # update the topic table
        insert_sql = sql.insert_topic_url(self.platform, topic_url, referer_url, updatedate, is_crawled, priority_rating)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG)

        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job succeeded', log.INFO)
        except:
            log.msg(str(traceback.print_exc()), log.ERROR)
            log.msg('copy job failed', log.ERROR)
Example #2
 def reply_status(self, response):
     # return to the front page afterwards; useful for boards that require a reply,
     # though for posts that need no reply it costs some performance
     # check the reply status
     
     success = u'非常感谢,你的回复已经发布'.encode('gbk', 'ignore')
     status = re.findall(success, response.body)
     username_sql = sql.select_accountbyusername(self.username, self.platform)
     
     # hourly reply limit
     failure = u'对不起,您所在的用户组每小时限制发帖 '.encode('gbk', 'ignore')
     failure_status = re.findall(failure, response.body, re.I)
     if failure_status:
         return
     if status:
         log.msg('reply success', log.INFO) 
         reply_nums = handle_db(username_sql)['data'][0]
         self.point_num = reply_nums[5]
         self.reply_num = reply_nums[-2]
         self.reply_num += 1
         self.point_num += 1
         # reply succeeded: reply_num +1, points +1
         try:
             update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
             update_success = handle_db(update_replynum)
             log.msg(('<username: %s points +1, now %s>' % (self.username, self.point_num)), log.INFO)
         except Exception, e:
             log.msg(str(traceback.print_exc()), log.ERROR)
Example #3
    def get_apk(self, response):
        '''
        Write the downloaded package to a file and record it in the database.
        '''
        filename = response.request.meta['filename']
        filename = ''.join([str(random.randrange(1,100000)), '.', filename])
        # create the download directory via the shared helper
        down_dir = utils.make_spiderdir(self.name, 'download')
        # decode: some filenames may be utf-8 encoded; decode to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except:
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        with open(filename, 'wb') as f:
            f.write(response.body)
        # persist to the database only after the download completes, for accuracy
        topic_url = response.request.meta['topic_url']
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.name, hashurl, topic_url, updatedate, filename)
        status = db.handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 
        # update the topic table
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = db.handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)
        # copy to the backup directory
        try:
            autocopy.copy(filename, self.name)
            log.msg('copy job succeeded', log.INFO)
        except:
            log.msg('copy job failed', log.ERROR)
Example #4
 def parse_item(self, response):
     # parse out the packages to download
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         if re.findall(u'指定的主题不存在或已被删除或正在被审核,请返回'.encode('gbk', 'ignore'), response.body, re.I):
             # mark the topic as crawled
             log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             return
     except:
         log.msg(str(traceback.print_exc()), log.INFO)
     try:
         urls = re.findall('<a href="(.*?)".*\.apk</a>', response.body, re.I)
         if urls == []:
             # content that must be replied to before it can be downloaded
             reply_urls = re.findall(u'如果你要查看本帖隐藏内容请'.encode('gbk', 'ignore'), response.body, re.I)
             # if a reply is required, set the crawl priority to -1; a separate
             # reply-and-download module handles priority -1 topics
             print 'reply_urls', reply_urls
             if reply_urls != []:
                 update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                 n = handle_db(update_topic_priority_sql)
                 log.msg(str(n), log.INFO)
                 return
             log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
             # without an apk the topic URL is useless; set is_crawled=1
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             # reply succeeded but there is no download link; keep going rather than abort
             self.topic_reply_num -= 1
             log.msg(('reply success, will download software%s' % str(self.topic_reply_num)), log.DEBUG)
             log.msg(str(response), log.DEBUG)
             request = self.get_down_topic(response, topic_url)
             yield request
         else:
             for url in set(urls):
                 url = url.replace('amp;', '')
                 print 'url:', url
                 request =  response.request.replace(url=url, callback=self.get_downloadpath)
                 request.meta['url'] = response.url
                 yield request
         
     except IndexError, e:
         log.msg(str(traceback.print_exc()), log.ERROR)
         # no apk package here; mark the URL as crawled (is_crawled=1)
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO) 
         return
Example #5
 def parse_item(self, response):
     # parse out the packages to download
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
             # mark the topic as crawled
             log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             return
     except:
         log.msg(str(traceback.print_exc()), log.INFO)
     try:
         urls = re.findall('<a href="(forum.php?[^"]+)" target="_blank">.*?\.apk</a>', response.body, re.I)
         if urls == []:
             # cannot download without replying first
             noreply_regex = u'如果您要查看本帖隐藏内容请'.encode('gbk', 'ignore')
             noreply = re.findall(noreply_regex, response.body, re.I)
             for i in noreply:
                 print i.decode('gbk', 'ignore')
             if noreply != []:
                 # update the topic: set the priority field to -1
                 update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                 n = handle_db(update_topic_priority_sql)
                 log.msg(''.join(['hide_apk_update topic_url priority=-1', str(n)]), log.INFO)
                 return
             else:
                 log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
                 # without an apk the topic URL is useless; set is_crawled=1
                 update_topic_url = sql.topicurl_withcrawed(topic_url)
                 status = handle_db(update_topic_url)
                 return
         else:
             for url in set(urls):
                 url = 'http://bbs.mumayi.com/%s' % url
                 request =  response.request.replace(url=url, callback=self.get_apk)
                 request.meta['url'] = response.url
                 yield request
         
     except IndexError, e:
         log.msg(str(traceback.print_exc()), log.ERROR)
         # no apk package here; mark the URL as crawled (is_crawled=1)
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO) 
         return
Example #6
def get_proxy(spider_name, enable):
    try:
        proxy_sql = sql.get_proxy_url(spider_name, enable)
        n = handle_db(proxy_sql)['data'][0][0]
        
        print n
    except:
        # if every proxy has been used the query returns nothing and indexing
        # raises; re-enable the whole pool, then call this method again
        update_enable_sql = sql.update_proxy_enable(spider_name)
        n = handle_db(update_enable_sql)
        print n
        # call this method again and return its result
        n = get_proxy(spider_name, enable)
    return n
Example #7
 def get_integral_page(self, response):
     # extract the user's points with a regex
     try:
         print 'get_integral_page'
         integral = re.findall(u'金币: <a href=".*?">(.*?)</a>'.encode('gbk', 'ignore'), response.body, re.I)[0].replace(' ', '')
         print 'integral', integral
         if integral:
             # points found: update the database; below the threshold stop here, above it continue
             update_user_integral_sql = sql.update_account_point(self.username, self.platform, integral)
             n = handle_db(update_user_integral_sql)
             log.msg(('update user(%s)\'s integral is: %s, %s' % (self.username, integral, n)), log.INFO)
             # the minimum points required for downloading is configurable
             if int(integral) > settings.INTEGERAL:
                 request = self.get_topic(response)
                 return request
             else:
                 print 'return None'
                 return
         else:
             log.msg('cannot get user\'s integral', log.ERROR)
             request = self.get_topic(response)
             return request
     except:
         log.msg(str(traceback.print_exc()), log.ERROR)
         request = self.get_topic(response)
         print 'except'
         return request
Example #8
 def parse_item(self, response):
     # parse out the packages to download
     topic_url = response.request.meta['topic_url']
     try:
         urls = re.findall('<a.*?href="(.*?)".*?>.*\.(?:apk|zip|rar).*?</a>', response.body, re.I)
         print urls
         if urls == []:
             print 'this url->%s has no apk file' % response.request.meta['topic_url']
             # without an apk the topic URL is useless; set is_crawled=1
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             print status
         for url in set(urls):
             url = 'http://bbs.gfan.com/%s' % url
             request =  response.request.replace(url=url, callback=self.get_attachementpath)
             request.meta['url'] = response.url
             yield request
         return
     except IndexError, e:
         traceback.print_exc()
         # no apk package here; mark the URL as crawled (is_crawled=1)
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             print 'this url has no apk'
         return
Example #9
    def get_apk(self, response):

        filename = ''.join([str(random.randrange(1,100000)), '.apk'])
        # create the download directory via the shared helper
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # decode: some filenames may be utf-8 encoded; decode to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except:
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        with open(filename, 'wb') as f:
            f.write(response.body)
        # persist to the database only after the download completes, for accuracy
        topic_url = response.request.meta['topic_url']
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 
        # update the topic table
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)

        # reaching this method means the file was downloaded: points -1
        account_sql = sql.select_accountbyusername(self.username, self.platform)
        point_num = handle_db(account_sql)['data'][0][5]
        point_num -= 1
        # then update the account
        update_account_pointsql = sql.update_account_point(self.username, self.platform, point_num)
        n = handle_db(update_account_pointsql)
        if n['errorNo'] == 0:
            log.msg(('<username: %s points -1, now %s>' % (self.username, point_num)), log.INFO)
        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job succeeded', log.INFO)
        except:
            log.msg(str(traceback.print_exc()), log.ERROR)
            log.msg('copy job failed', log.ERROR)
        
        request_topic = self.repeat_reply(response)
        return request_topic
Example #10
 def reply_status(self, response):
     # return to the front page afterwards; useful for boards that require a reply,
     # though for posts that need no reply it costs some performance
     # check the reply status
     
     success = u'非常感谢,你的回复已经发布'.encode('gbk', 'ignore')
     status = re.findall(success, response.body)
     username_sql = sql.select_accountbyusername(self.username, self.platform)
     print 'status', status
     # hourly reply limit
     failure = u'对不起,您所在的用户组每小时限制发帖 '.encode('gbk', 'ignore')
     failure_status = re.findall(failure, response.body, re.I)
     if failure_status:
         print 'hourly reply limit reached for this user group'
         return
     
     if status:
         log.msg('reply success', log.INFO) 
         reply_nums = handle_db(username_sql)['data'][0]
         self.point_num = reply_nums[5]
         self.reply_num = reply_nums[-2]
         self.reply_num += 1
         self.point_num += 1
         # reply succeeded: reply_num +1, points +1
         try:
             update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
             update_success = handle_db(update_replynum)
             log.msg(('<username: %s points +1, now %s>' % (self.username, self.point_num)), log.INFO)
             # reply succeeded; proceed to download
             url = response.request.meta['topic_url']
             print 'reply succeeded, topic url---->', url
             # reply succeeded; revisit the topic page
             self.topic_reply_num -= 1
             log.msg(('reply success, will download software%s' % str(self.topic_reply_num)), log.DEBUG)
             log.msg(str(response), log.DEBUG)
             request = self.get_down_topic(response, url)
             return request
         except Exception, e:
             log.msg(str(traceback.print_exc()), log.ERROR)
             # call get_topic again; REPLY_NUM -1, and when REPLY_NUM < 1 do nothing further
             self.topic_reply_num -= 1
             log.msg(('reply success, will download software%s' % str(self.topic_reply_num)), log.DEBUG)
             log.msg(str(response), log.DEBUG)
             request_topic = self.repeat_reply(response)
             return request_topic
Example #11
 def get_apk(self, response):
     filename = response.request.meta['filename']
     
     if os.path.exists(''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep])) is False:
         os.mkdir(''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep]))
     filename = ''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'), os.sep, str(time.time()).split('.')[0], filename])
     with open(filename, 'wb') as f:
         f.write(response.body)
     # persist to the database only after the download completes, for accuracy
     topic_url = response.request.meta['topic_url']
     hashurl = sql.hash_topic(topic_url)
     updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
     filename = filename.replace('\\', '\\\\')
     insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
     status = handle_db(insert_sql)
     print status 
     # update the topic table
     update_topic_url = sql.topicurl_withcrawed(topic_url)
     status = handle_db(update_topic_url)
     print status
Example #12
 def __init__(self):
     account_message = get_account(self.platform)
     self.username =  account_message[2]
     self.password = account_message[3]        
     self.reply_num = 0
     # before the spider starts, set the account's in-use flag to 1 so no other thread uses it
     update_avail_sql = sql.update_use_byusernamesql(self.username, self.platform, '1')
     n = handle_db(update_avail_sql)
     log.msg(('<username: %s > is in use' % self.username), log.DEBUG)
     # when the spider receives the spider_closed signal, call user_relax
     dispatcher.connect(self.user_relax, signals.spider_closed)
Example #13
 def process_item(self, item, spider):
     # unpack the incoming item
     topic_url = item['topic_url']
     referer_url = item['referer_url']
     spider_name = item['spider_name']
     update_date = time.strftime('%Y-%m-%d %H:%M:%S')
     is_crawled = 0
     priority_rating = 0
     insert_sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
     n = handle_db(insert_sql)
     print n
     return item
Example #14
 def parse_item(self, response):
     # parse out the packages to download
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
             # mark the topic as crawled
             log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             return
     except:
         log.msg(str(traceback.print_exc()), log.INFO)
     try:
         urls = re.findall('<a href="(forum.php?[^"]+)".*>.*?\.apk', response.body, re.I)
         if urls == []:
             # cannot download without replying first
             log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
             # without an apk the topic URL is useless; set is_crawled=1
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             request_topic = self.repeat_reply(response)
             yield request_topic
         else:
             for url in set(urls):
                 url = 'http://bbs.mumayi.com/%s' % url
                 url = url.replace('amp;', '')
                 request =  response.request.replace(url=url, method='get', callback=self.get_apk)
                 request.meta['url'] = response.url
                 yield request
         
     except IndexError, e:
         log.msg(str(traceback.print_exc()), log.ERROR)
         # no apk package here; mark the URL as crawled (is_crawled=1)
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO) 
         return
Example #15
 def get_topic(self, response):
     # pick an undownloaded topic (by time) and reply to it
     
     topic_urls = sql.get_topic_ranone(self.platform, '0')
     url = handle_db(topic_urls)['data'][0]
     
     request = response.request.replace(url = url[0], method='get')
     request.callback = self.get_code
     #request.meta['crawled'] = url[1]
     request.meta['topic_url'] = url[0]
     # keep the logged-in response so later callbacks can reuse it
     request.meta['topic_response'] = response
     return request
Example #16
 def reply_status(self, response):
     # return to the front page afterwards; useful for boards that require a reply,
     # though for posts that need no reply it costs some performance
     # check the reply status
     
     success = '非常感谢,您的回复已经发布'
     status = re.findall(success, response.body)
     username_sql = sql.select_accountbyusername(self.username, self.platform)
     
     if status:
         log.msg('reply success', log.INFO) 
         reply_nums = handle_db(username_sql)['data'][0]
         self.point_num = reply_nums[5]
         self.reply_num = reply_nums[-2]
         self.reply_num += 1
         self.point_num += 1
         # reply succeeded: reply_num +1, points +1
         try:
             update_replynum = sql.update_account_username(self.username, self.platform, self.reply_num, self.point_num)
             update_success = handle_db(update_replynum)
             log.msg(('<username: %s points +1, now %s>' % (self.username, self.point_num)), log.INFO)
         except Exception, e:
             log.msg(str(traceback.print_exc()), log.ERROR)
Example #17
 def parse_item(self, response):
     # parse out the packages to download
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
             # mark the topic as crawled
             log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             
             return
     except:
         log.msg(str(traceback.print_exc()), log.INFO)
     try:
         urls = re.findall('<a href="(attachment.php?[^"]+)".*?>.*\.apk', response.body, re.I)
         if urls == []:
             log.msg(('this url->%s has no apk file' % response.request.meta['topic_url']), log.INFO)
             # without an apk the topic URL is useless; set is_crawled=1
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             return
         else:
             for url in set(urls):
                 url = 'http://bbs.gfan.com/%s' % url
                 request =  response.request.replace(url=url, callback=self.get_attachementpath)
                 request.meta['url'] = response.url
                 yield request
         
     except IndexError, e:
         log.msg(str(traceback.print_exc()), log.ERROR)
         # no apk package here; mark the URL as crawled (is_crawled=1)
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO) 
         return
Example #18
    def get_apk(self, response):

        filename = ''.join([str(random.randrange(1,100000)), '.ipa'])
        # create the download directory via the shared helper
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # decode: some filenames may be utf-8 encoded; decode to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except:
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])

        # persist to the database only after the download completes, for accuracy
        topic_url = response.request.url
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        # insert the topic row first
        insert_topic_sql = sql.insert_topic_url(self.name, topic_url, '', updatedate, '1', '0')
        topic_status = handle_db(insert_topic_sql)
        if topic_status['errorNo'] == -1:
            raise ValueError("this ipa file has already been downloaded (row exists in the database)")
        with open(filename, 'wb') as f:
            f.write(response.body)
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 

        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job succeeded', log.INFO)
        except:
            log.msg(str(traceback.print_exc()), log.ERROR)
            log.msg('copy job failed', log.ERROR)
Example #19
 def reply_status(self, response):
     # return to the front page afterwards; useful for boards that require a reply,
     # though for posts that need no reply it costs some performance
     # check the reply status
     
     success = '非常感谢,您的回复已经发布'
     status = re.findall(success, response.body)
     print self.username
     username_sql = sql.select_accountbyusername(self.username, self.platform)
     
     if status:
         print 'reply success'
         reply_nums = handle_db(username_sql)['data'][0]
         
         self.reply_num = reply_nums[-2]
         self.reply_num += 1
         # reply succeeded: reply_num +1
         try:
             now_time = time.strftime('%Y-%m-%d %H:%M:%S')
             update_replynum = sql.update_account_username(self.username, now_time, self.platform, self.reply_num)
             update_success = handle_db(update_replynum)
             print update_success
             print 'update success! user\'s reply_num + 1'
         except Exception, e:
             traceback.print_exc()
Example #20
 def get_topic(self, response):
     # pick download topics and reply to them so they can be downloaded
     
     print 'get_topic--------->hello world'
     topic_urls = sql.get_topic_bycrawed(self.platform, '0')
     urls = handle_db(topic_urls)['data']
     
     for url in urls:
         print 'url---------->', url
         request = response.request.replace(url = url[0], method='get', meta={'proxy' : 'http://119.115.136.226:443'})
         request.callback = self.get_code
         request.meta['crawled'] = url[1] 
         request.meta['topic_url'] = url[0]
         #request.meta['proxy'] = self.proxy_url
         yield request
Example #21
 def get_downloadpath(self, response):
     topic_url = response.request.meta['topic_url']
     log.msg(('get_downloadpath, topic_url: %s' % response.request.meta['topic_url']), log.INFO)
     try:
         # the page just before the download URL is response.url
         url = re.findall(u'<a href="(http://bbs\.goapk\.com/forum\.php\?mod=attachment.*aid=.*)">', response.body, re.I)[0]
         # filename (temporary)
         url = url.replace('amp;', '')
         request = response.request.replace(url=url, callback=self.get_apk)
         return request
     except:
         traceback.print_exc()
         # an exception here means the filename could not be extracted
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print status
Example #22
 def process_item(self, item, spider):
     # unpack the incoming item
     log.msg(str(item), log.DEBUG)
     try:
         topic_url = item['topic_url']
         referer_url = item['referer_url']
         spider_name = item['spider_name']
         update_date = time.strftime('%Y-%m-%d %H:%M:%S')
         is_crawled = 0
         priority_rating = 5
         insert_sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
         n = handle_db(insert_sql)
         log.msg('insert a message: ' + str(n), log.DEBUG)
     except:
         log.msg(str(traceback.print_exc()), log.DEBUG)
     return item
Example #23
 def __init__(self):
     # fetch an account
     account_message = get_account(self.platform)
     self.username =  account_message[2]
     self.password = account_message[3]        
     self.reply_num = 0
     
     # before the spider starts, mark the account as in use
     update_avail_sql = sql.update_use_byusernamesql(self.username, self.platform, '1')
     n = handle_db(update_avail_sql)
     log.msg(('<username : %s > is in use' % self.username), log.DEBUG)
     
     # number of topics to reply to, taken from settings.py
     self.topic_reply_num = settings.REPLY_NUM
     
     # when the spider receives the spider_closed signal, call user_relax
     dispatcher.connect(self.user_relax, signals.spider_closed)
Example #24
 def get_code(self, response):
     
     # the site admin removed the topic page
     if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
         # mark the topic as crawled
         topic_url = response.request.meta['topic_url']
         print 'this topic_url(%s) has been removed by admin' % response.url
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print status
         return
     print 'response.url',response.url
     request_code = response.request.replace(url = 'http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',\
                                             callback=self.ay_code)
     request_code.meta['response'] = response
     #request_code.meta['proxy'] = self.proxy_url
     response.request.meta['response'] = response
     yield request_code
Example #25
 def get_downloadpath(self, response):
     topic_url = response.request.meta['topic_url']
     print 'get_downloadpath,topic_url', topic_url
     try:
         # the page just before the download URL is response.url
         url = re.findall('<p class="alert_btnleft"><a href="(.*?)">', response.body, re.I)[0]
         url = 'http://bbs.gfan.com/%s' %  url
         # filename (temporary)
         file_name = re.findall('<div class="alert_info">\s+<p>.*?“(.*?)”', response.body, re.I)[0]
         request = response.request.replace(url=url, callback=self.get_apk)
         request.meta['filename'] = file_name
         return request
     except:
         traceback.print_exc()
         # an exception here means the filename could not be extracted
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print status
Example #26
 def get_downloadpath(self, response):
     topic_url = response.request.meta['topic_url']
     log.msg(('get_downloadpath, topic_url: %s' % response.request.meta['topic_url']), log.INFO)
     try:
         # the page just before the download URL is response.url
         url = re.findall('<a href="(attachment.php?[^"]+)".*?>', response.body, re.I)[0]
         url = 'http://bbs.gfan.com/%s' %  url
         # filename (temporary)
         file_name = re.findall('“.*\.(.*?)”', response.body, re.I)[0]
         request = response.request.replace(url=url, callback=self.get_apk)
         request.meta['filename'] = file_name
         return request
     except:
         traceback.print_exc()
         # an exception here means the filename could not be extracted
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print status
Example #27
    def get_code(self, response):
        
        if self.topic_reply_num < 1:
            return
        time.sleep(20)
        # the site admin removed the topic page
        if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
            # mark the topic as crawled
            topic_url = response.request.meta['topic_url']
            log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            status = handle_db(update_topic_url)
            # schedule another reply via repeat_reply
            request_topic = self.repeat_reply(response)
            return request_topic

        message = self.get_othermessage(response)
        if len(message) < 10:
            message = ''.join([re.sub('<.*?>','', message), '..........'] )
        try:
            posttime = str(time.time()).split('.')[0]
            hash = getFormHash(response.body)
            formdata = {"message" : message,'posttime' : posttime,
                            'formhash' : hash, 'subject' : ''}
            formrequest = FormRequest.from_response(response=response, formnumber=1, formdata=formdata, dont_click=True)

            formrequest.callback = self.reply_status
            print 'response.url---->',response.url
            #url = self.get_reply_url(response)
            #if url:
            #    formrequest._url = url
            #else:
            #    self.repeat_reply(response)
            
            #formrequest._url = '%s%s' % (formrequest.url, '&inajax=1')
            formrequest.meta['topic_url'] = response.request.meta['topic_url']
            formrequest.meta['topic_response'] = response.request.meta['topic_response']
            #formrequest.meta['proxy'] = self.proxy_url
            return [formrequest]
        except:
            # if replying is impossible (e.g. no form raises IndexError), fall back to reply_status
            request_topic = self.reply_status(response)
            return [request_topic]
Example #28
 def get_integral_page(self, response):
     # extract the user's points with a regex
     try:
         integral = re.findall(u'<li><em>金币</em>(.*?)</li>'.encode('gbk', 'ignore'), response.body, re.I)[0].replace(' ', '')
         if integral:
             # points found: update the database
             update_user_integral_sql = sql.update_account_point(self.username, self.platform, integral)
             n = handle_db(update_user_integral_sql)
             log.msg(('update user(%s)\'s integral is: %s, %s' % (self.username, integral, n)), log.INFO)
             # reply and download now run in parallel; the minimum-points gate was dropped because it stopped replies here
             request = self.get_topic(response)
             return request
         else:
             log.msg('cannot get user\'s integral', log.ERROR)
             request = self.get_topic(response)
             return request
     except:
         log.msg(str(traceback.print_exc()), log.ERROR)
         request = self.get_topic(response)
         return request
Example #29
 def get_code(self, response):
     
     if self.topic_reply_num < 1:
         return
     time.sleep(20)
      # the site admin removed the topic page
     if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
          # mark the topic as crawled
         topic_url = response.request.meta['topic_url']
          log.msg(('this topic_url(%s) has been removed by admin' % response.url), log.INFO)
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
          # schedule another reply via repeat_reply
         request_topic = self.repeat_reply(response)
         return request_topic
      log.msg(('response.url: %s' % response.url), log.INFO)
     request_code = response.request.replace(url = 'http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',\
                                             callback=self.ay_code)
     request_code.meta['response'] = response
     #request_code.meta['proxy'] = self.proxy_url
     #response.request.meta['response'] = response
     return request_code
Example #30
 def get_downloadurl(self, response):
      '''
      Parse apk links out of the topic page with a regex. Following every download
      URL directly would not be controllable and would fetch many useless packages.
      '''
     try:
         down_urls = re.findall('<a href="(.*?)">.*\.(apk)</a>', response.body)
          # pages with download attachments are inserted into topicdata; a successful insert
          # means the page is new, and the row is updated after the real download
         if any(down_urls):
             try:
                 update_time = time.strftime('%Y-%m-%d %H:%M:%S')
                 insert_topic_sql = sql.insert_topic_url(self.name, response.url, response.request.url, update_time, '0', '0')
                 log.msg(insert_topic_sql, log.INFO)
                 status = db.handle_db(insert_topic_sql)
                 if status['errorNo'] == 0:
                     log.msg('this url can be used', log.INFO)
                 else:
                     log.msg('this url cannot be used', log.INFO)
                     # the return ensures already-crawled topic pages are skipped, so packages are not fetched twice
                     return
             except:
                 log.msg(str(traceback.print_exc()), log.INFO)
                 # most likely a unique-index violation: the row already exists in the database
                 return
         for url in down_urls:
             print url
             filename = url[1]
             url = '%s%s' % ('http://forum.xda-developers.com/', url[0])
             # the generated url goes into topicdata first, for when the topic page is re-crawled
             # store filename and topic_url in the request meta
             request = Request(url=url, callback=self.get_apk,
                               meta={'filename': filename, 'topic_url': response.url})
             yield request
     except:
         log.msg(str(traceback.print_exc()), log.INFO)
         return