def getAllPage(self):
    """Fetch page 1 of the wallpaper index and parse the total page count.

    Stores the page count (as a string, taken from the pagination widget)
    in ``self.API_PAGEALL``. If the count cannot be parsed, logs an error
    and leaves ``self.API_PAGEALL`` untouched instead of crashing.
    """
    url = self.API_HOST + "?p=1"
    res = requests.get(url, headers=self.HEADER)
    # The pagination widget renders like "<span>1/NNN</span>"; capture NNN.
    pages = re.search(r'<span>1.*?(\d*)</span>', res.text)
    if pages is None:
        # Guard: a layout change or blocked request previously raised
        # AttributeError on .group(1) here.
        logger.error('未能解析图片总页数')
        return
    self.API_PAGEALL = pages.group(1)
    logger.info(f'总共找到图片的页数{self.API_PAGEALL}')
def __init__(self):
    """Initialize the 360 wallpaper spider (headers, API endpoints, DB) and start crawling."""
    logger.info('spider360 模块启动中...')
    # Minimal UA-only header used for plain API requests.
    self.header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
    }
    # Full browser-style header set, including a session cookie, for
    # wallpaper.apc.360.cn — presumably needed to avoid being rejected
    # by the endpoint (TODO confirm; no visible caller in this chunk).
    self.header1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "__guid=92280206.530040451316176400.1579142607132.9788; Q=u%3D360H2548941901%26n%3D%26le%3D%26m%3DZGtmWGWOWGWOWGWOWGWOWGWOAQZ2%26qid%3D2548941901%26im%3D1_t01dbdd7e726a89e0ec%26src%3Dpcw_so_image_qq%26t%3D1; T=s%3Dcfec7fa03be89cefab2bd998153c13fc%26t%3D1579142992%26lm%3D%26lf%3D%26sk%3D2d7d2aa636fd90ec7e4053dc960da44b%26mt%3D1579142992%26rc%3D%26v%3D2.0%26a%3D1",  # yapf: disabled
        "Host": "wallpaper.apc.360.cn",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
    }
    # Base endpoint plus per-action query fragments; the '@parmas'
    # placeholders are substituted with concrete values when URLs are
    # built (see getAll / getNEWS).
    self.API_HOST = 'http://wallpaper.apc.360.cn/index.php?from=360chrome&c=WallPaper'
    self.API_TYPE = '&a=getAllCategoriesV2'
    self.API_NEW = '&a=getAppsByOrder&order=create_time&start=@parmas&count=@parmas'
    self.API_ARG = '&a=getAppsByCategory&cid=@parmas&start=@parmas&count=@parmas'
    self.dao = DBS()  # database access object
    self.start()  # kick off crawling immediately on construction
def set_up(self):
    """Run every discovered spider callable in its own daemon thread and wait for all to finish.

    Threads are marked daemon so they do not keep the process alive if the
    main thread exits early; the final join loop blocks until all spiders
    complete.
    """
    logger.info(f'爬虫进程:{self.list_lib}')
    for lib in self.list_lib:
        self.list_process.append(Thread(target=lib))
    for task in self.list_process:
        # Thread.setDaemon() is deprecated since Python 3.10; set the
        # attribute directly instead.
        task.daemon = True
        task.start()
    for task in self.list_process:
        task.join()
def __init__(self): logger.info('爬虫模块启动...') # 计时开始 self.START_TIME = time() # 爬虫进程 self.list_process = [] # 相关爬虫模块 self.list_lib = self.filterSpider() # 启动爬虫进程 self.set_up()
def parseLink(self):
    """Fetch the daily-sentence payload for every date in DATE_LIS and pass it to parseJSON.

    Waits ``self.INTERVAL`` seconds between successful requests; a non-200
    response is logged and the date is skipped.
    """
    for d in self.DATE_LIS:
        logger.info(f'获取{d}的数据')
        url = self.API_HOST + d
        res = requests.get(url, headers=self.HEADER)
        if res.status_code == 200:
            res.encoding = 'utf-8'
            self.parseJSON(res.text)
            sleep(self.INTERVAL)
        else:
            # Report through the module logger (the original used a bare
            # print, inconsistent with the rest of the file); the trailing
            # no-op `continue` is dropped.
            logger.error(f"数据获取出错:{d}")
def __init__(self):
    """Initialize the iciba daily-sentence spider and start crawling."""
    logger.info('spiderJinShan 模块启动中...')
    # Desktop browser User-Agent to avoid trivial bot filtering.
    self.HEADER = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
    }
    self.DAO = DBS()  # database access object
    # Daily-sentence detail endpoint; a date title is appended per request
    # (see parseLink).
    self.API_HOST = r'http://sentence.iciba.com/index.php?c=dailysentence&m=getdetail&title='
    self.DATE_LIS = []  # dates to fetch; presumably filled elsewhere before parseLink runs
    self.INTERVAL = 0.5  # seconds to sleep between successful requests
    self.TOTAL = 0  # count of newly stored records (reported in __del__)
    self.TOTAL_EXIST = 0  # count of records skipped as already present
    self.start()  # kick off crawling immediately on construction
def __init__(self):
    """Initialize the Bing wallpaper spider (bing.ioliu.cn) and start crawling."""
    logger.info('spiderBiYing 模块启动中...')
    # Desktop browser User-Agent to avoid trivial bot filtering.
    self.HEADER = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
    }
    self.DAO = DBS()  # database access object
    self.API_HOST = 'https://bing.ioliu.cn/'
    self.API_PAGE = 1  # current page cursor
    self.API_PAGEALL = None  # total page count, set by getAllPage()
    self.INTERVAL = 1  # seconds to wait between requests
    self.TOTAL = 0  # count of newly stored records
    self.TOTAL_EXIST = 0  # count of records skipped as already present
    self.start()  # kick off crawling immediately on construction
def getAll(self):
    """Crawl every wallpaper category from the 360 API and store each image via the DAO.

    For each category returned by API_TYPE, pages through API_ARG results
    (count=150 per request) and inserts url/tag/resolution/category rows,
    counting how many were newly stored.
    """
    logger.info('开始获取360壁纸全部数据')
    url = self.API_HOST + self.API_TYPE
    res = self.getText(url, self.header)
    data = json.loads(res).get('data', '')
    # Running grand totals across all categories for the final log line.
    totalALl, fetchedAll = 0, 0
    if isinstance(data, list) and len(data) > 0:
        for cate in data:
            # Per-category state: id, display name, paging offset, page
            # size, newly-stored counter, server-reported total.
            cid, category, start, count, fetched, total = cate['id'], cate[
                'name'], 0, 150, 0, 0
            logger.debug(f"开始获取<{category}>数据")
            while True:
                # Substitute the three '@parmas' placeholders in order:
                # cid, start, count.
                url = self.API_HOST + self.API_ARG.replace(
                    '@parmas', str(cid), 1).replace(
                        "@parmas", str(start), 1).replace(
                            "@parmas", str(count))
                res = self.getText(url, self.header)
                res = json.loads(res)
                total = int(res.get("total", 0))
                lis = res.get('data', '')
                if isinstance(lis, list) and len(lis) > 0:
                    for item in lis:
                        print(f'fetching ---> {category}: {fetched}', end="\r")
                        # Rewrite the size suffix (e.g. __85) to __100 to
                        # request the full-resolution image URL.
                        url = sub(r'__\d\d', "__100", item['url'])
                        tag = item['tag']
                        reso = item['resolution'].replace("*", "x")
                        # insertImage presumably returns None on a fresh
                        # insert and non-None when the row already exists
                        # -- TODO confirm against the DBS implementation.
                        result = self.dao.insertImage(
                            url, tag, reso, category)
                        if result is None:
                            fetched += 1
                        if (fetched >= total):
                            totalALl += total
                            fetchedAll += fetched
                            print(
                                f"fetched ---> 分类:{category} 总计:{fetched}"
                            )
                            break
                    else:
                        # for-else: page exhausted without reaching the
                        # category total -> fetch the next page.
                        # NOTE(review): `start` is never advanced here, so
                        # the same page (start=0) is requested again;
                        # termination relies entirely on fetched >= total.
                        # Looks like this should do `start += count` -- verify.
                        continue
                    # Reached only when the inner for-loop broke, i.e. the
                    # whole category has been fetched: leave the while loop.
                    break
                # NOTE(review): if `lis` is empty or not a list there is no
                # break/continue, so the while-loop re-requests the same URL
                # indefinitely -- confirm intended behavior.
    logger.info(f'spider360--getAll运行结束,总共获取{fetchedAll}/{totalALl}张图片')
def __init__(self, **args):
    """Open a MySQL connection using settings from config/config.json.

    Reads the JSON config located two directories above this module,
    then connects with PyMySQL using DictCursor (rows come back as
    dicts). Terminates the process via SystemExit if the connection
    cannot be established.
    """
    logger.info("数据库模块启动...")
    # Resolve the config file relative to this module's location.
    config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../../config/config.json"))
    # config.json is expected to be UTF-8; make the encoding explicit so
    # the read does not depend on the platform default.
    with open(config_path, encoding='utf-8') as file:
        self.config = json.load(file)
    # Initialize the connection.
    try:
        self.cnn = pymysql.connect(
            host=self.config.get("db_host", ''),
            user=self.config.get("db_user", ''),
            password=self.config.get("db_passwd", ''),
            db=self.config.get("db_database", ""),
            charset=self.config.get("db_charset", ''),
            cursorclass=pymysql.cursors.DictCursor)
    except Exception:
        logger.error('数据库连接初始化失败,请检查配置文件')
        # The builtin exit() comes from the site module and may be absent
        # (python -S); raise SystemExit explicitly instead.
        raise SystemExit(1)
def getNEWS(self):
    """Page through the 360 'newest wallpapers' feed and store each image URL/tag pair.

    Requests batches of 30 via API_NEW, advancing the start offset as
    items are consumed, and stops when the feed returns an empty page.
    """
    logger.info('开始获取360壁纸最新数据。。。')
    # start: paging offset; page: batch size; fetched: newly stored rows;
    # total: all rows seen (reported as fetched/total at the end).
    start, page, fetched, total = 0, 30, 0, 0
    # Loop over pages of the newest-wallpaper feed until it is exhausted.
    while True:
        url = self.API_NEW.replace("@parmas", str(start),
                                   1).replace("@parmas", str(page))
        res = self.getText(self.API_HOST + url, self.header)
        data = json.loads(res).get("data", '')
        if isinstance(data, list) and len(data) > 0:
            for item in data:
                print(f"fetching --> {total}", end='\r')
                # Rewrite the size suffix (e.g. __85) to __100 to request
                # the full-resolution image URL.
                url = sub(r'__\d\d', "__100", item['url'])
                tag = item['tag']
                # insert presumably returns None for a fresh row and
                # non-None for a duplicate -- TODO confirm against DBS.
                result = self.dao.insert(url, tag)
                if result is None:
                    fetched += 1
                # NOTE(review): reconstructed from a flattened source;
                # total/start are taken to advance once per item (start
                # thus moves by one full batch per page) -- verify the
                # original nesting.
                total += 1
                start += 1
        else:
            # Empty page: the feed is exhausted.
            break
    logger.info(f'spider360--getNews运行结束,总共获取{fetched}/{total}张图片')
def __del__(self):
    """Emit a crawl summary (new rows vs. already-present rows) at teardown."""
    summary = f'SpiderJinShan爬取进程运行结束, 新获取数据:{self.TOTAL}, 过滤已存在:{self.TOTAL_EXIST}'
    logger.info(summary)
def __del__(self):
    """Log spider shutdown when the instance is garbage-collected."""
    logger.info('SpiderBiYing爬取进程运行结束')
def __del__(self):
    """Close the MySQL connection at teardown.

    Guards against the case where __init__ failed before ``self.cnn``
    was assigned (its connect-error path aborts early), which would
    otherwise raise AttributeError during interpreter shutdown.
    """
    cnn = getattr(self, 'cnn', None)
    if cnn is not None:
        cnn.close()
        logger.info('数据库连接关闭')
def __del__(self):
    """Report the total crawl duration when the manager is destroyed."""
    elapsed = time() - float(self.START_TIME)
    logger.info('爬虫模块运行结束, 耗时:%fs', elapsed)
def __del__(self):
    """Log spider360 shutdown when the instance is garbage-collected."""
    # TODO: to be revised later (original author's note)
    logger.info('spider360爬取进程运行结束')