Пример #1
0
 def parse_item(self, response):
     """Yield a follow-up request for every attachment link on a topic page.

     Scans ``response.body`` for anchors whose link text names an ``.apk``,
     ``.zip`` or ``.rar`` file.  Each distinct href is prefixed with the
     gfan forum base URL and re-requested with ``self.get_attachementpath``
     as callback; the URL of the current page is stored in ``meta['url']``.

     When no such anchor is found, the topic is flagged through
     ``sql.topicurl_withcrawed`` / ``handle_db`` (per the original comment
     this sets is_crawled=1) and nothing is yielded.

     :param response: topic page response; ``response.request.meta`` must
         contain ``'topic_url'``.
     """
     topic_url = response.request.meta['topic_url']
     try:
         # ``(?:apk|zip|rar)`` is a real alternation.  The previous
         # ``[apk|zip|rar]`` was a character class matching one single
         # character, so links such as "shot.png" were picked up as well.
         urls = re.findall(
             r'<a.*?href="(.*?)".*?>.*\.(?:apk|zip|rar).*?</a>',
             response.body, re.I)
         print(urls)
         if not urls:
             print('this url->%s has not apk file' % topic_url)
             # Without a package the topic link is useless: mark it crawled.
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             status = handle_db(update_topic_url)
             print(status)
             return
         # set() drops duplicate links so each attachment is requested once.
         for url in set(urls):
             url = 'http://bbs.gfan.com/%s' % url
             request = response.request.replace(url=url, callback=self.get_attachementpath)
             request.meta['url'] = response.url
             yield request
     except IndexError:
         traceback.print_exc()
         # No apk package here: mark this link as crawled.
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         # Was ``handler_db``; every other call site uses ``handle_db``.
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             print('this url has no apk')
Пример #2
0
 def parse_item(self, response):
     """Parse a topic page and schedule downloads for its ``.apk`` links.

     Steps, in order:

     * If the page shows the "topic does not exist / deleted / under
       review" notice, flag the topic via ``sql.topicurl_withcrawed`` and
       stop.
     * If ``.apk`` anchors are present, yield one request per distinct
       link with ``self.get_downloadpath`` as callback.
     * If none are present but the page carries the "reply to see hidden
       content" notice, set the topic priority to ``'-1'`` and stop (per
       the original comment a separate reply-and-download module handles
       those topics).
     * Otherwise flag the topic as crawled, decrement
       ``self.topic_reply_num`` and yield whatever
       ``self.get_down_topic`` returns.

     :param response: topic page response; ``response.request.meta`` is
         expected to contain ``'topic_url'``.
     """
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         removed_notice = u'指定的主题不存在或已被删除或正在被审核,请返回'.encode('gbk', 'ignore')
         if re.findall(removed_notice, response.body, re.I):
             # Topic was removed: update the topic record and stop.
             log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns
         # None, so the old code logged the literal string "None".
         log.msg(traceback.format_exc(), log.INFO)
     try:
         urls = re.findall(r'<a href="(.*?)".*\.apk</a>', response.body, re.I)
         if not urls:
             # Content that can only be downloaded after replying.
             reply_marker = u'如果你要查看本帖隐藏内容请'.encode('gbk', 'ignore')
             reply_urls = re.findall(reply_marker, response.body, re.I)
             print('reply_urls')
             print(reply_urls)
             if reply_urls:
                 update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                 n = handle_db(update_topic_priority_sql)
                 log.msg(str(n), log.INFO)
                 return
             log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
             # Without an apk the topic link is useless: mark it crawled.
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             # Reply succeeded but there is no download link; the session
             # must not stop here, so carry on with the next topic.
             self.topic_reply_num -= 1
             log.msg(('reply success, will download software%s' % str(self.topic_reply_num)), log.DEBUG)
             log.msg(str(response), log.DEBUG)
             # NOTE(review): the original passed ``url`` here, which is
             # never bound on this branch (UnboundLocalError).  The current
             # page URL is used instead -- confirm against the signature
             # of get_down_topic.
             request = self.get_down_topic(response, response.url)
             yield request
         else:
             for url in set(urls):
                 url = url.replace('amp;', '')
                 print('url: %s' % url)
                 request = response.request.replace(url=url, callback=self.get_downloadpath)
                 request.meta['url'] = response.url
                 yield request

     except IndexError:
         log.msg(traceback.format_exc(), log.ERROR)
         # No apk package here: mark this link as crawled.
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         # Was ``handler_db``; every other call site uses ``handle_db``.
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO)
         return
Пример #3
0
 def parse_item(self, response):
     """Parse a mumayi topic page and yield requests for its ``.apk`` links.

     * A page showing the "topic does not exist / deleted / under review"
       notice is flagged via ``sql.topicurl_withcrawed`` and dropped.
     * Each distinct ``forum.php?...`` attachment link whose text ends in
       ``.apk`` is requested with ``self.get_apk`` as callback.
     * With no link but a "reply to see hidden content" notice, the topic
       priority is set to ``'-1'``.
     * With neither, the topic is flagged as crawled.

     :param response: topic page response; ``response.request.meta`` is
         expected to contain ``'topic_url'``.
     """
     log.msg(response.url, log.INFO)
     try:
         # Read topic_url first: the original assigned it only after the
         # removed-topic branch that already needed it.
         topic_url = response.request.meta['topic_url']
         if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
             # Topic was removed: update the topic record and stop.
             log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns None.
         log.msg(traceback.format_exc(), log.INFO)
     try:
         # "." and "?" are escaped so they match literally; unescaped, "?"
         # merely made the preceding "p" optional.
         urls = re.findall(r'<a href="(forum\.php\?[^"]+)" target="_blank">.*?\.apk</a>', response.body, re.I)
         if not urls:
             # Download is hidden until the user replies.
             noreply_regex = u'如果您要查看本帖隐藏内容请'.encode('gbk', 'ignore')
             noreply = re.findall(noreply_regex, response.body, re.I)
             for hit in noreply:
                 print(hit.decode('gbk', 'ignore'))
             if noreply:
                 # Update the topic with priority -1.
                 update_topic_priority_sql = sql.update_topic_priority(self.platform, topic_url, '-1')
                 n = handle_db(update_topic_priority_sql)
                 log.msg(''.join(['hide_apk_update topic_url priority=-1', str(n)]), log.INFO)
                 return
             log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
             # Without an apk the topic link is useless: mark it crawled.
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
         for url in set(urls):
             url = 'http://bbs.mumayi.com/%s' % url
             request = response.request.replace(url=url, callback=self.get_apk)
             request.meta['url'] = response.url
             yield request

     except IndexError:
         log.msg(traceback.format_exc(), log.ERROR)
         # No apk package here: mark this link as crawled.
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         # Was ``handler_db``; every other call site uses ``handle_db``.
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO)
         return
Пример #4
0
    def get_apk(self, response):
        """Save the downloaded package to disk and record it in the database.

        The body of ``response`` is written to a file inside the directory
        returned by ``utils.make_spiderdir(self.name, 'download')``.  The
        file name is the current Unix timestamp, a random number, a dot and
        ``response.request.meta['filename']``.  After the write, a software
        record is inserted, the topic is flagged via
        ``sql.topicurl_withcrawed`` and the file is handed to
        ``autocopy.copy`` for backup.

        :param response: download response; ``response.request.meta`` must
            contain ``'filename'`` and ``'topic_url'``.
        """
        filename = response.request.meta['filename']
        filename = ''.join([str(random.randrange(1, 100000)), '.', filename])
        # Helper that creates the download folder for this spider.
        down_dir = utils.make_spiderdir(self.name, 'download')
        # Some names may be utf-8 encoded: decode them to unicode.
        try:
            filename = filename.decode('utf-8', 'ignore')
        except UnicodeError:
            # Best effort: keep the name unchanged when it cannot be decoded.
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        # ``with`` closes the handle even if the write fails.
        with open(filename, 'wb') as apk_file:
            apk_file.write(response.body)
        # Store in the database only after the download, for accuracy.
        topic_url = response.request.meta['topic_url']
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        # Double the backslashes so Windows paths survive the SQL string.
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.name, hashurl, topic_url, updatedate, filename)
        status = db.handle_db(insert_sql)
        log.msg(str(status), log.DEBUG)
        # Update the topic table.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = db.handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)
        # Back up the file.
        try:
            autocopy.copy(filename, self.name)
            log.msg('copy job is successed', log.INFO)
        except Exception:
            log.msg('copy job is failture', log.ERROR)
Пример #5
0
 def parse_item(self, response):
     """Parse a mumayi topic page after replying and yield download requests.

     * A page showing the "topic does not exist / deleted / under review"
       notice is flagged via ``sql.topicurl_withcrawed`` and dropped.
     * Each distinct ``forum.php?...`` link followed by ``.apk`` is
       requested with ``self.get_apk`` as callback.
     * With no such link the topic is flagged as crawled and the result of
       ``self.repeat_reply(response)`` is yielded.

     :param response: topic page response; ``response.request.meta`` is
         expected to contain ``'topic_url'``.
     """
     log.msg(response.url, log.INFO)
     try:
         # Read topic_url first: the original assigned it only after the
         # removed-topic branch that already needed it.
         topic_url = response.request.meta['topic_url']
         if re.findall(u'抱歉,指定的主题不存在或已被删除或正在被审核'.encode('gbk', 'ignore'), response.body, re.I):
             # Topic was removed: update the topic record and stop.
             log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns None.
         log.msg(traceback.format_exc(), log.INFO)
     try:
         # "." and "?" are escaped so they match literally.
         urls = re.findall(r'<a href="(forum\.php\?[^"]+)".*>.*?\.apk', response.body, re.I)
         if not urls:
             log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
             # Without an apk the topic link is useless: mark it crawled.
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             request_topic = self.repeat_reply(response)
             yield request_topic
         else:
             for url in set(urls):
                 url = 'http://bbs.mumayi.com/%s' % url
                 # Turn the HTML entity "&amp;" back into "&".
                 url = url.replace('amp;', '')
                 request = response.request.replace(url=url, method='get', callback=self.get_apk)
                 request.meta['url'] = response.url
                 yield request

     except IndexError:
         log.msg(traceback.format_exc(), log.ERROR)
         # No apk package here: mark this link as crawled.
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         # Was ``handler_db``; every other call site uses ``handle_db``.
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO)
         return
Пример #6
0
 def parse_item(self, response):
     """Parse a gfan topic page and yield requests for its ``.apk`` attachments.

     * A page showing the "topic does not exist / deleted / under review"
       notice is flagged via ``sql.topicurl_withcrawed`` and dropped.
     * Each distinct ``attachment.php?...`` link followed by ``.apk`` is
       requested with ``self.get_attachementpath`` as callback.
     * With no such link the topic is flagged as crawled.

     :param response: topic page response; ``response.request.meta`` is
         expected to contain ``'topic_url'``.
     """
     log.msg(response.url, log.INFO)
     try:
         topic_url = response.request.meta['topic_url']
         if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
             # Topic was removed: update the topic record and stop.
             log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
     except Exception:
         # format_exc() returns the traceback text; print_exc() returns None.
         log.msg(traceback.format_exc(), log.INFO)
     try:
         # "." and "?" are escaped so they match literally.
         urls = re.findall(r'<a href="(attachment\.php\?[^"]+)".*?>.*\.apk', response.body, re.I)
         if not urls:
             log.msg(('this url->%s has not apk file' % response.request.meta['topic_url']), log.INFO)
             # Without an apk the topic link is useless: mark it crawled.
             update_topic_url = sql.topicurl_withcrawed(topic_url)
             handle_db(update_topic_url)
             return
         for url in set(urls):
             url = 'http://bbs.gfan.com/%s' % url
             request = response.request.replace(url=url, callback=self.get_attachementpath)
             request.meta['url'] = response.url
             yield request

     except IndexError:
         log.msg(traceback.format_exc(), log.ERROR)
         # No apk package here: mark this link as crawled.
         update_crawled_sql = sql.topicurl_withcrawed(response.url)
         # Was ``handler_db``; every other call site uses ``handle_db``.
         status = handle_db(update_crawled_sql)
         if status['errorNo'] == 1:
             log.msg('this url has no apk', log.INFO)
         return
Пример #7
0
 def get_downloadpath(self, response):
     """Extract the goapk attachment URL and return a request for it.

     Takes the first ``forum.php?mod=attachment...aid=...`` link in
     ``response.body`` and returns a copy of the current request pointing
     at it, with ``self.get_apk`` as callback.  If no link is found (or
     anything else fails) the topic is flagged via
     ``sql.topicurl_withcrawed`` and ``None`` is returned implicitly.

     :param response: intermediate download page; ``response.request.meta``
         must contain ``'topic_url'``.
     """
     topic_url = response.request.meta['topic_url']
     log.msg(('get_downloadpath,topic_url', topic_url), log.INFO)
     try:
         # The page before the real download address is response.url.
         # ``[^"]*`` keeps the capture inside the href attribute; the old
         # greedy ``.*`` ran on to the last ``">`` of the line.
         url = re.findall(
             r'<a href="(http://bbs\.goapk\.com/forum\.php\?mod=attachment[^"]*aid=[^"]*)">',
             response.body, re.I)[0]
         # Turn the HTML entity "&amp;" back into "&".
         url = url.replace('amp;', '')
         request = response.request.replace(url=url, callback=self.get_apk)
         return request
     except Exception:
         traceback.print_exc()
         # Original author's open question: does an exception here mean
         # the file name could not be scraped?
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print(status)
Пример #8
0
 def get_downloadpath(self, response):
     """Extract the gfan download URL and file name, return a request for it.

     Reads the first link inside ``<p class="alert_btnleft">`` and the
     quoted name inside ``<div class="alert_info">`` from
     ``response.body``.  Returns a copy of the current request pointing at
     the link (prefixed with the gfan base URL), with ``self.get_apk`` as
     callback and the name stored in ``meta['filename']``.  If either
     pattern is missing (or anything else fails) the topic is flagged via
     ``sql.topicurl_withcrawed`` and ``None`` is returned implicitly.

     :param response: intermediate download page; ``response.request.meta``
         must contain ``'topic_url'``.
     """
     # Bound up front: the except handler below needs it, and the original
     # never defined it (NameError inside the handler).
     topic_url = response.request.meta['topic_url']
     print('get_downloadpath,topic_url %s' % topic_url)
     try:
         # The page before the real download address is response.url.
         url = re.findall(r'<p class="alert_btnleft"><a href="(.*?)">', response.body, re.I)[0]
         url = 'http://bbs.gfan.com/%s' % url
         # File name, for now.
         file_name = re.findall(r'<div class="alert_info">\s+<p>.*?“(.*?)”', response.body, re.I)[0]
         request = response.request.replace(url=url, callback=self.get_apk)
         request.meta['filename'] = file_name
         return request
     except Exception:
         traceback.print_exc()
         # Original author's open question: does an exception here mean
         # the file name could not be scraped?
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print(status)
Пример #9
0
 def get_code(self, response):
     """Yield the request for the gfan security-question endpoint.

     If the page shows the "topic does not exist / deleted / under review"
     notice, the topic is flagged via ``sql.topicurl_withcrawed`` and
     nothing is yielded.  Otherwise a copy of the current request pointing
     at the ``newupdatesecqaa`` ajax URL is yielded with ``self.ay_code``
     as callback; the topic response is stored under ``meta['response']``
     on both the new and the current request.

     :param response: topic page response; ``response.request.meta`` must
         contain ``'topic_url'`` when the topic was removed.
     """
     # The site admin removed the topic content.
     if re.findall('指定的主题不存在或已被删除或正在被审核', response.body, re.I):
         # The original used ``topic_url`` here without ever binding it.
         topic_url = response.request.meta['topic_url']
         print('this topic_url(%s) has be removed by admin' % response.url)
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print(status)
         return
     print('response.url %s' % response.url)
     request_code = response.request.replace(
         url='http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content',
         callback=self.ay_code)
     request_code.meta['response'] = response
     response.request.meta['response'] = response
     yield request_code
Пример #10
0
    def get_apk(self, response):
        """Save the downloaded apk, record it, and charge one account point.

        The body of ``response`` is written to a file named
        ``<timestamp><random>.apk`` inside the directory returned by
        ``utils.make_spiderdir(self.platform, 'download')``.  Afterwards a
        software record is inserted, the topic is flagged via
        ``sql.topicurl_withcrawed``, the stored point balance of
        ``self.username`` is decremented by one, and the file is handed to
        ``autocopy.copy`` for backup.

        :param response: download response; ``response.request.meta`` must
            contain ``'topic_url'``.
        :return: whatever ``self.repeat_reply(response)`` returns.
        """
        filename = ''.join([str(random.randrange(1, 100000)), '.apk'])
        # Helper that creates the download folder for this platform.
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # Some names may be utf-8 encoded: decode them to unicode.
        try:
            filename = filename.decode('utf-8', 'ignore')
        except UnicodeError:
            # Best effort: keep the name unchanged when it cannot be decoded.
            pass
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        # ``with`` closes the handle even if the write fails.
        with open(filename, 'wb') as apk_file:
            apk_file.write(response.body)
        # Store in the database only after the download, for accuracy.
        topic_url = response.request.meta['topic_url']
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        # Double the backslashes so Windows paths survive the SQL string.
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG)
        # Update the topic table.
        update_topic_url = sql.topicurl_withcrawed(topic_url)
        status = handle_db(update_topic_url)
        log.msg(str(status), log.DEBUG)

        # Reaching this point means the response was downloaded: points - 1.
        account_sql = sql.select_accountbyusername(self.username, self.platform)
        # NOTE(review): column 5 of the first row is presumably the point
        # balance -- confirm against the account table layout.
        point_num = handle_db(account_sql)['data'][0][5]
        point_num -= 1
        # Then write the new balance back.
        update_account_pointsql = sql.update_account_point(self.username, self.platform, point_num)
        n = handle_db(update_account_pointsql)
        if n['errorNo'] == 0:
            # Report the balance just written; the original printed
            # ``self.reply_num``, which is unrelated to the point update.
            log.msg(('<username: %s \'s integral is : -1 ,now integral is %s>' % (self.username, point_num)), log.INFO)
        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job is successed', log.INFO)
        except Exception:
            # format_exc() returns the traceback text; print_exc() returns None.
            log.msg(traceback.format_exc(), log.ERROR)
            log.msg('copy job is failture', log.ERROR)

        request_topic = self.repeat_reply(response)
        return request_topic
Пример #11
0
 def get_downloadpath(self, response):
     """Extract the gfan attachment URL and file extension, return a request.

     Takes the first ``attachment.php?...`` link in ``response.body`` and
     the text between the last dot and the closing curly quote of the
     quoted file name.  Returns a copy of the current request pointing at
     the link (prefixed with the gfan base URL), with ``self.get_apk`` as
     callback and the extracted text stored in ``meta['filename']``.  If
     either pattern is missing (or anything else fails) the topic is
     flagged via ``sql.topicurl_withcrawed`` and ``None`` is returned
     implicitly.

     :param response: intermediate download page; ``response.request.meta``
         must contain ``'topic_url'``.
     """
     topic_url = response.request.meta['topic_url']
     log.msg(('get_downloadpath,topic_url', topic_url), log.INFO)
     try:
         # The page before the real download address is response.url.
         # "." and "?" are escaped so they match literally.
         url = re.findall(r'<a href="(attachment\.php\?[^"]+)".*?>', response.body, re.I)[0]
         url = 'http://bbs.gfan.com/%s' % url
         # File name, for now.
         file_name = re.findall(r'“.*\.(.*?)”', response.body, re.I)[0]
         request = response.request.replace(url=url, callback=self.get_apk)
         request.meta['filename'] = file_name
         return request
     except Exception:
         traceback.print_exc()
         # Original author's open question: does an exception here mean
         # the file name could not be scraped?
         update_topic_url = sql.topicurl_withcrawed(topic_url)
         status = handle_db(update_topic_url)
         print(status)
Пример #12
0
 def get_apk(self, response):
     """Write the downloaded package to today's folder and record it.

     The file goes to ``<cwd>/gfan/download/<YYYY-MM-DD>/`` and is named
     with the current Unix timestamp followed by
     ``response.request.meta['filename']``.  Afterwards a software record
     is inserted and the topic is flagged via ``sql.topicurl_withcrawed``.

     :param response: download response; ``response.request.meta`` must
         contain ``'filename'`` and ``'topic_url'``.
     """
     filename = response.request.meta['filename']

     # Build the dated directory once so the existence check, the creation
     # and the final path cannot disagree across midnight.
     day_dir = ''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'), os.sep])
     if not os.path.exists(day_dir):
         # makedirs also creates missing parents; mkdir failed without them.
         os.makedirs(day_dir)
     filename = ''.join([day_dir, str(time.time()).split('.')[0], filename])
     # ``with`` closes the handle even if the write fails.
     with open(filename, 'wb') as apk_file:
         apk_file.write(response.body)
     # Store in the database only after the download, for accuracy.
     topic_url = response.request.meta['topic_url']
     hashurl = sql.hash_topic(topic_url)
     updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
     # Double the backslashes so Windows paths survive the SQL string.
     filename = filename.replace('\\', '\\\\')
     insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
     status = handle_db(insert_sql)
     print(status)
     # Update the topic table.
     update_topic_url = sql.topicurl_withcrawed(topic_url)
     status = handle_db(update_topic_url)
     print(status)
Пример #13
0
    def get_code(self, response):
        """Build the reply form submission for a topic page.

        Returns ``None`` right away when ``self.topic_reply_num`` is below
        1.  Otherwise sleeps 20 seconds, then:

        * if the page shows the "topic does not exist / deleted / under
          review" notice, flags the topic via ``sql.topicurl_withcrawed``
          and returns ``self.repeat_reply(response)``;
        * else fills form number 1 of the page with a message taken from
          ``self.get_othermessage`` and returns a one-element list holding
          the ``FormRequest``, with ``self.reply_status`` as callback;
        * if building the form fails, returns a one-element list holding
          ``self.reply_status(response)``.

        :param response: topic page response; ``response.request.meta`` is
            expected to contain ``'topic_url'`` and ``'topic_response'``.
        """
        if self.topic_reply_num < 1:
            return
        time.sleep(20)
        # The site admin removed the topic content.
        if re.findall('指定的主题不存在或已被删除或正在被审核'.decode('utf-8', 'ignore').encode('utf-8', 'ignore'), response.body, re.I):
            # The original used ``topic_url`` here without ever binding it.
            topic_url = response.request.meta['topic_url']
            log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
            update_topic_url = sql.topicurl_withcrawed(topic_url)
            handle_db(update_topic_url)
            # Call back into the reply loop.
            request_topic = self.repeat_reply(response)
            return request_topic

        message = self.get_othermessage(response)
        if len(message) < 10:
            # Strip tags and pad short messages with dots.
            message = ''.join([re.sub('<.*?>', '', message), '..........'])
        try:
            posttime = str(time.time()).split('.')[0]
            # Named form_hash so the builtin ``hash`` is not shadowed.
            form_hash = getFormHash(response.body)
            formdata = {"message": message, 'posttime': posttime,
                        'formhash': form_hash, 'subject': ''}
            formrequest = FormRequest.from_response(response=response, formnumber=1, formdata=formdata, dont_click=True,)

            formrequest.callback = self.reply_status
            print('response.url----> %s' % response.url)
            formrequest.meta['topic_url'] = response.request.meta['topic_url']
            formrequest.meta['topic_response'] = response.request.meta['topic_response']
            return [formrequest]
        except Exception:
            # Replying is impossible (e.g. the page has no form and
            # from_response raised IndexError): fall back to reply_status.
            request_topic = self.reply_status(response)
            return [request_topic]
Пример #14
0
 def get_code(self, response):
     """Return the request for the gfan security-question endpoint.

     Returns ``None`` right away when ``self.topic_reply_num`` is below 1.
     Otherwise sleeps 20 seconds.  A page showing the "topic does not
     exist / deleted / under review" notice is flagged via
     ``sql.topicurl_withcrawed`` and ``self.repeat_reply(response)`` is
     returned.  In every other case a copy of the current request pointing
     at the ``newupdatesecqaa`` ajax URL is returned, with ``self.ay_code``
     as callback and the topic response stored under ``meta['response']``.
     """
     if self.topic_reply_num < 1:
         return None

     time.sleep(20)

     # The site admin removed the topic content.
     removed = re.search('指定的主题不存在或已被删除或正在被审核', response.body, re.I)
     if removed is not None:
         crawled_url = response.request.meta['topic_url']
         log.msg(('this topic_url(%s) has be removed by admin' % response.url), log.INFO)
         status = handle_db(sql.topicurl_withcrawed(crawled_url))
         # Call back into the reply loop.
         return self.repeat_reply(response)

     log.msg(('response.url', response.url), log.INFO)
     code_url = 'http://bbs.gfan.com/ajax.php?action=newupdatesecqaa&inajax=0&ajaxtarget=secanswer_menu_content'
     request_code = response.request.replace(url=code_url, callback=self.ay_code)
     request_code.meta['response'] = response
     return request_code
Пример #15
0
 def pass_topic(self, response):
     """Yield either a ``SrcItem`` for the topic or its download requests.

     When ``response.body`` matches ``settings.TRAIT`` (re-encoded to
     ``settings.CODE``), one ``SrcItem`` carrying the page URL, the
     ``Referer`` header and ``settings.SPIDERNAME`` is yielded.  Otherwise
     every distinct match of ``settings.DOWN_REG`` is prefixed with
     ``settings.DOMAIN`` and requested with ``self.get_apk`` as callback.
     When nothing matches, a single ``None`` is yielded, as before.

     :param response: topic page response.
     """
     # Look for topic pages that contain a download link.
     noauth = re.findall(('%s' % settings.TRAIT).decode('utf-8', 'ignore').encode('%s' % settings.CODE), response.body, re.I)
     referer_url = response.request.headers.get('Referer', None)
     if noauth:
         item = SrcItem()
         # First visit to this page.
         item['topic_url'] = response.url
         item['referer_url'] = referer_url
         item['spider_name'] = settings.SPIDERNAME
         yield item
     else:
         try:
             urls = re.findall(settings.DOWN_REG, response.body, re.I)
             if not urls:
                 log.msg(('this url->%s has not apk file' % response.url), log.INFO)
                 # Bare yield kept so the generator output is unchanged.
                 yield
             else:
                 for url in set(urls):
                     url = '%s%s' % (settings.DOMAIN, url)
                     print('download_url %s' % url)
                     # Turn the HTML entity "&amp;" back into "&".
                     url = url.replace('amp;', '')
                     request = response.request.replace(url=url, method='get', callback=self.get_apk)
                     request.meta['referer_url'] = referer_url
                     yield request

         except IndexError:
             # format_exc() returns the traceback text; print_exc() returns None.
             log.msg(traceback.format_exc(), log.ERROR)
             # No apk package here: mark this link as crawled.
             update_crawled_sql = sql.topicurl_withcrawed(response.url)
             # Was ``handler_db``; the sibling spiders all use ``handle_db``.
             status = handle_db(update_crawled_sql)
             if status['errorNo'] == 1:
                 log.msg('this url has no apk', log.INFO)
             yield