Example #1
    def get_apk(self, response):
        topic_url = referer_url = response.request.meta['referer_url']
        update_date = time.strftime('%Y-%m-%d %H:%M:%S')
        is_crawled = '1'
        priority_rating = '0'
        filename = ''.join([str(random.randrange(1,100000)), '.apk'])
#        if os.path.exists(''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep])) is False:
#            os.mkdir(''.join([os.getcwd(), os.sep, 'gfan', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep]))
        # call the helper that creates the download directory
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # decode: the filename may be UTF-8 encoded; convert it to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except Exception:
            pass  # already unicode; leave the name unchanged
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
        with open(filename, 'wb') as f:
            f.write(response.body)
        # store to the database only after the download completes, to keep records accurate
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 
        # update the topic table
        insert_sql = sql.insert_topic_url(self.platform, topic_url, referer_url, updatedate, is_crawled, priority_rating)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG)

        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job succeeded', log.INFO)
        except Exception:
            log.msg(traceback.format_exc(), log.ERROR)
            log.msg('copy job failed', log.ERROR)
Example #2
 def process_item(self, item, spider):
     # unpack the fields of the item passed in from the spider
     topic_url = item['topic_url']
     referer_url = item['referer_url']
     spider_name = item['spider_name']
     update_date = time.strftime('%Y-%m-%d %H:%M:%S')
     is_crawled = 0
     priority_rating = 0
     sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
     n = handle_db(sql)
     print n
     return item
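
insert_topic_url itself is not shown in any of these snippets; handle_db apparently executes whatever SQL string it returns. A hypothetical reconstruction matching the call signature used above (the table and column names are guesses):

    def insert_topic_url(spider_name, topic_url, referer_url,
                         update_date, is_crawled, priority_rating):
        # Hypothetical: build the INSERT statement that handle_db() runs.
        # Production code should use parameterized queries rather than
        # string formatting, which is open to SQL injection.
        return ("INSERT INTO topicdata "
                "(spider_name, topic_url, referer_url, update_date, "
                "is_crawled, priority_rating) "
                "VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" %
                (spider_name, topic_url, referer_url,
                 update_date, is_crawled, priority_rating))
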
Example #3
 def process_item(self, item, spider):
     # unpack the fields of the item passed in from the spider
     log.msg(str(item), log.DEBUG)
     try:
         topic_url = item['topic_url']
         referer_url = item['referer_url']
         spider_name = item['spider_name']
         update_date = time.strftime('%Y-%m-%d %H:%M:%S')
         is_crawled = 0
         priority_rating = 5
         sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
         n = handle_db(sql)
         # log inside the try block so n is always bound here
         log.msg('inserted a record: ' + str(n), log.DEBUG)
     except Exception:
         log.msg(traceback.format_exc(), log.DEBUG)
     return item
Example #4
    def get_apk(self, response):

        filename = ''.join([str(random.randrange(1,100000)), '.ipa'])
#        if os.path.exists(''.join([os.getcwd(), os.sep, 'hiapk', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep])) is False:
#            os.mkdir(''.join([os.getcwd(), os.sep, 'hiapk', os.sep, 'download', os.sep, time.strftime('%Y-%m-%d'),os.sep]))
        # call the helper that creates the download directory
        down_dir = utils.make_spiderdir(self.platform, 'download')
        # decode: the filename may be UTF-8 encoded; convert it to unicode
        try:
            filename = filename.decode('utf-8', 'ignore')
        except Exception:
            pass  # already unicode; leave the name unchanged
        filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])

        # store to the database after downloading, to keep records accurate
        topic_url = response.request.url
        hashurl = sql.hash_topic(topic_url)
        updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
        filename = filename.replace('\\', '\\\\')
        # insert the topic page record into the database
        insert_topic_sql = sql.insert_topic_url(self.name, topic_url, '', updatedate, '1', '0')
        topic_status = handle_db(insert_topic_sql)
        if topic_status['errorNo'] == -1:
            raise ValueError("this ipa file has already been downloaded (row exists in the database)")
        with open(filename, 'wb') as f:
            f.write(response.body)
        insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
        status = handle_db(insert_sql)
        log.msg(str(status), log.DEBUG) 

        try:
            autocopy.copy(filename, self.platform)
            log.msg('copy job succeeded', log.INFO)
        except Exception:
            log.msg(traceback.format_exc(), log.ERROR)
            log.msg('copy job failed', log.ERROR)
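
The duplicate check above relies on handle_db returning a dict whose errorNo is 0 on success and -1 on failure, typically the unique-index violation raised when a topic_url row already exists. A stand-in sketch using sqlite3; the original presumably wraps MySQL, so treat this as illustrative only.

    import sqlite3
    import traceback

    def handle_db(statement, db_path='spider.db'):
        # Illustrative stand-in: run one statement and report the outcome
        # via the {'errorNo': 0 | -1} convention the callers above expect.
        try:
            conn = sqlite3.connect(db_path)
            try:
                conn.execute(statement)
                conn.commit()
            finally:
                conn.close()
            return {'errorNo': 0}
        except Exception:
            traceback.print_exc()
            return {'errorNo': -1}
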
Example #5
 def get_downloadurl(self, response):
     '''
     Parse the topic page body with an apk regular expression.
     Following the download links straight to download pages is not
     controllable here, i.e. it would fetch many useless packages.
     '''
     try:
         # group 1 captures the href, group 2 the .apk link text
         down_urls = re.findall(r'<a href="(.*?)">(.*?\.apk)</a>', response.body)
         # pages containing download links are inserted into the topicdata table;
         # a successful insert means the page has not been crawled yet, and the
         # row is updated after the real download
         if down_urls:
             try:
                 update_time = time.strftime('%Y-%m-%d %H:%M:%S')
                 insert_topic_sql = sql.insert_topic_url(self.name, response.url, response.request.url, update_time, '0', '0')
                 log.msg(insert_topic_sql, log.INFO)
                 status = db.handle_db(insert_topic_sql)
                 if status['errorNo'] == 0:
                     log.msg('this url can be used', log.INFO)
                 else:
                     log.msg("this url can't be used", log.INFO)
                     # return so a topic page that was already crawled is not
                     # crawled again, avoiding duplicate package downloads
                     return
             except Exception:
                 log.msg(traceback.format_exc(), log.INFO)
                 # expected: a unique-index violation, i.e. the row already
                 # exists in the database
                 return
         for href, filename in down_urls:
             url = '%s%s' % ('http://forum.xda-developers.com/', href)
             print url
             # the generated url goes into the topicdata table first, so a
             # repeated crawl of the same topic page can be detected
             # pass the filename and topic_url along in the request's meta
             request = Request(url=url, callback=self.get_apk,
                               meta={'filename': filename, 'topic_url': response.url})
             yield request
     except Exception:
         log.msg(traceback.format_exc(), log.INFO)
         return
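
A quick check of the regular expression on a made-up HTML fragment, showing that group 1 captures the href and group 2 the .apk filename that get_apk later reads from meta:

    import re

    sample = '<a href="attachment.php?attachmentid=1">MyRom-v1.2.apk</a>'
    matches = re.findall(r'<a href="(.*?)">(.*?\.apk)</a>', sample)
    print(matches)  # [('attachment.php?attachmentid=1', 'MyRom-v1.2.apk')]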