def get_apk(self, response):
    topic_url = referer_url = response.request.meta['referer_url']
    update_date = time.strftime('%Y-%m-%d %H:%M:%S')
    is_crawled = '1'
    priority_rating = '0'
    filename = ''.join([str(random.randrange(1, 100000)), '.apk'])
    # create the download directory via the shared helper
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # decode: some filenames may be UTF-8 byte strings; convert to unicode
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    open(filename, 'wb').write(response.body)
    # write the database record only after the download succeeds, for accuracy
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    # update the topic table
    insert_sql = sql.insert_topic_url(self.platform, topic_url, referer_url, updatedate, is_crawled, priority_rating)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg(traceback.format_exc(), log.ERROR)
        log.msg('copy job failed', log.ERROR)
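# --- assumed helper (not part of the original source) ---
# utils.make_spiderdir is called above but never shown in this section. Judging
# from the commented-out mkdir code it replaced, it creates (if missing) and
# returns a dated per-platform directory. A minimal sketch, assuming that
# layout; the exact path scheme is a guess:
import os
import time

def make_spiderdir(platform, subdir):
    """Create <cwd>/<platform>/<subdir>/<YYYY-MM-DD> if missing; return its path."""
    path = os.path.join(os.getcwd(), platform, subdir, time.strftime('%Y-%m-%d'))
    if not os.path.exists(path):
        os.makedirs(path)  # creates intermediate directories as needed
    return path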
def process_item(self, item, spider):
    # unpack the item passed in from the spider
    topic_url = item['topic_url']
    referer_url = item['referer_url']
    spider_name = item['spider_name']
    update_date = time.strftime('%Y-%m-%d %H:%M:%S')
    is_crawled = 0
    priority_rating = 0
    sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
    n = handle_db(sql)
    log.msg('inserted a topic row: ' + str(n), log.DEBUG)
    return item
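# --- usage note (hypothetical module path) ---
# For process_item to be invoked, the pipeline class must be enabled in the
# project's settings.py. The module path and priority below are assumptions;
# recent Scrapy takes a {path: priority} dict, while very old releases used a
# plain list of paths:
ITEM_PIPELINES = {
    'myproject.pipelines.TopicUrlPipeline': 300,  # lower number = runs earlier
}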
def process_item(self, item, spider):
    # unpack the item passed in from the spider
    log.msg(str(item), log.DEBUG)
    try:
        topic_url = item['topic_url']
        referer_url = item['referer_url']
        spider_name = item['spider_name']
        update_date = time.strftime('%Y-%m-%d %H:%M:%S')
        is_crawled = 0
        priority_rating = 5
        sql = insert_topic_url(spider_name, topic_url, referer_url, update_date, is_crawled, priority_rating)
        n = handle_db(sql)
        # log inside the try block: n is unbound when the insert fails
        log.msg('inserted a message: ' + str(n), log.DEBUG)
    except:
        log.msg(traceback.format_exc(), log.DEBUG)
    return item
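# --- assumed helper (not part of the original source) ---
# handle_db is used throughout this section but never defined here. From the
# call sites it executes one SQL statement and returns a dict whose 'errorNo'
# key is 0 on success and -1 on failure (e.g. a unique-index violation). A
# minimal sketch using MySQLdb; the connection parameters are placeholders:
import MySQLdb

def handle_db(statement):
    result = {'errorNo': 0}
    conn = MySQLdb.connect(host='localhost', user='spider',
                           passwd='secret', db='spiderdb', charset='utf8')
    try:
        cursor = conn.cursor()
        cursor.execute(statement)
        conn.commit()
    except MySQLdb.Error:
        conn.rollback()
        result['errorNo'] = -1  # callers treat -1 as "row already exists / failed"
    finally:
        conn.close()
    return result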
def get_apk(self, response):
    filename = ''.join([str(random.randrange(1, 100000)), '.ipa'])
    # create the download directory via the shared helper
    down_dir = utils.make_spiderdir(self.platform, 'download')
    # decode: some filenames may be UTF-8 byte strings; convert to unicode
    try:
        filename = filename.decode('utf-8', 'ignore')
    except:
        pass
    filename = ''.join([down_dir, os.sep, str(time.time()).split('.')[0], filename])
    # write the database record only after the download succeeds, for accuracy
    topic_url = response.request.url
    hashurl = sql.hash_topic(topic_url)
    updatedate = time.strftime('%Y-%m-%d %H:%M:%S')
    filename = filename.replace('\\', '\\\\')
    # insert the topic page row first; a failed insert means we already have it
    insert_topic_sql = sql.insert_topic_url(self.name, topic_url, '', updatedate, '1', '0')
    topic_status = handle_db(insert_topic_sql)
    if topic_status['errorNo'] == -1:
        raise ValueError('this ipa file has already been downloaded (row exists in the database)')
    open(filename, 'wb').write(response.body)
    insert_sql = sql.insert_softwareinfo(self.platform, hashurl, topic_url, updatedate, filename)
    status = handle_db(insert_sql)
    log.msg(str(status), log.DEBUG)
    try:
        autocopy.copy(filename, self.platform)
        log.msg('copy job succeeded', log.INFO)
    except:
        log.msg(traceback.format_exc(), log.ERROR)
        log.msg('copy job failed', log.ERROR)
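# --- assumed helpers (not part of the original source) ---
# The sql module's statement builders are not shown in this section. From the
# call sites, hash_topic reduces a URL to a fixed-length key (presumably for a
# unique index) and insert_topic_url returns an INSERT statement as a string.
# Minimal sketches; the table and column names are guesses:
import hashlib

def hash_topic(topic_url):
    # md5 hex digest as a stable, fixed-length key for the topic URL
    return hashlib.md5(topic_url).hexdigest()

def insert_topic_url(spider_name, topic_url, referer_url,
                     update_date, is_crawled, priority_rating):
    # string interpolation mirrors the style of the surrounding code;
    # parameterized queries would be the safer design choice
    return ("INSERT INTO topicdata (spider_name, topic_url, referer_url, "
            "update_date, is_crawled, priority_rating) "
            "VALUES ('%s', '%s', '%s', '%s', '%s', '%s')"
            % (spider_name, topic_url, referer_url,
               update_date, is_crawled, priority_rating))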
def get_downloadurl(self, response):
    '''
    Parse the topic page body with an apk regular expression.
    Following raw download links straight to download pages is not
    controllable here: it would pull in lots of useless packages.
    '''
    try:
        # group 1 is the href, group 2 the link text ending in .apk (the
        # original second group captured only the literal string 'apk')
        down_urls = re.findall(r'<a href="(.*?)">([^<]*?\.apk)</a>', response.body)
        # a page carrying download attachments is inserted into the topicdata
        # table; a successful insert means the page has not been crawled yet,
        # and the row is updated once the package is actually downloaded
        if any(down_urls):
            try:
                update_time = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_topic_sql = sql.insert_topic_url(self.name, response.url, response.request.url, update_time, '0', '0')
                log.msg(insert_topic_sql, log.INFO)
                status = db.handle_db(insert_topic_sql)
                if status['errorNo'] == 0:
                    log.msg('this url can be used', log.INFO)
                else:
                    log.msg("this url can't be used", log.INFO)
                    # returning here keeps already-crawled topic pages from
                    # being crawled again, so no package is downloaded twice
                    return
            except:
                log.msg(traceback.format_exc(), log.INFO)
                # a unique-index error is expected here, i.e. the row is
                # already in the database
                return
        for url, filename in down_urls:
            url = '%s%s' % ('http://forum.xda-developers.com/', url)
            # the generated url is recorded in topicdata first, so a repeated
            # visit to the same topic page is detected next time; topic_url is
            # carried to the callback via the request's meta dict
            request = Request(url=url, callback=self.get_apk,
                              meta={'filename': filename, 'topic_url': response.url})
            yield request
    except:
        log.msg(traceback.format_exc(), log.INFO)
        return
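# --- usage note ---
# Request.meta is the supported way to pass data between Scrapy callbacks (the
# original code assigned the private _meta attribute directly, fixed above).
# A minimal self-contained illustration; the thread URL is made up:
from scrapy.http import Request  # newer releases also allow: from scrapy import Request

def parse(self, response):
    yield Request(url='http://forum.xda-developers.com/some-thread',
                  callback=self.get_apk,
                  meta={'filename': 'example.apk', 'topic_url': response.url})

def get_apk(self, response):
    # response.meta is a shortcut for response.request.meta: the same dict
    # attached to the Request above, carried through the download
    filename = response.meta['filename']
    topic_url = response.meta['topic_url']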