class Spider:
    """Crawl film listings from the configured resource sites into MongoDB.

    Site documents live in the ``site`` collection and crawled records in the
    ``films`` collection.  A site whose ``data_type`` is ``'json'`` is read
    through the JSON endpoints; every other site is read as XML.
    """

    # Year stored when a record's year is missing, malformed or implausible.
    DEFAULT_YEAR = 1970

    def __init__(self):
        self.Mg = MongoUtil()

    def run(self, site_id=None):
        """Refresh metadata and crawl pending pages for the configured sites.

        Args:
            site_id: When given, only the site whose ``id`` field equals this
                value is crawled.  ``None`` (the default) crawls every site.
        """
        coll = self.Mg.getCol('site')
        query = {} if site_id is None else {'id': site_id}
        for site in coll.find(query):
            if site.get('data_type') == 'json':
                self.update_site_json(site)
                last_page = self.get_films_json(site)
            else:
                self.update_site(site, coll)
                last_page = self.get_films(site)
            coll.update_one({'_id': site['_id']},
                            {'$set': {'last_page': last_page}})

    # Strip illegal characters
    def filter_xml(self, str):
        """Return ``str`` stripped, with every ``re_filters_xml`` match removed.

        The parameter keeps its historical name (it shadows the builtin) so
        that keyword callers are not broken; it is rebound to ``text`` at once.
        """
        text = str.strip()
        for pattern in re_filters_xml:
            text = re.sub(pattern, '', text)
        return text

    # Film filter
    def filter_film(self, name):
        """Return 1 when ``name`` matches ``re_filters_name``, otherwise 0."""
        return 1 if re.search(re_filters_name, name) else 0

    @staticmethod
    def _text(node, tag):
        """Return the text of child ``tag`` of ``node`` (None if absent/empty)."""
        found = node.find(tag)
        return None if found is None else found.text

    def _upsert_film(self, coll, info):
        """Insert ``info`` or merge its play sources into the existing record.

        Records are matched on ``film_name`` + ``note``.

        Returns:
            True when a new document was inserted, False when an existing
            document had its ``video`` list merged.
        """
        old = coll.find_one({'film_name': info['film_name'],
                             'note': info['note']})
        if not old:
            coll.insert_one(info)
            return True
        merged = self.set_video(info['video'], old.get('video', []))
        coll.update_one({'_id': old['_id']}, {'$set': {'video': merged}})
        return False

    # Update record count and page size
    def update_site(self, site, coll):
        """Refresh paging counters and category list of an XML site.

        Best effort: network errors, unparsable XML and responses without a
        ``<list>`` element leave the site document untouched.
        """
        try:
            news = requests.get(site['api_url'], timeout=5)
            root = ET.fromstring(self.filter_xml(news.text))
        except (requests.RequestException, ET.ParseError):
            return
        list_node = root.find('list')
        if list_node is None:
            return
        attrs = list_node.attrib
        class_node = root.find('class')
        # Explicit None test: an Element without children is falsy.
        categories = class_node if class_node is not None else ()
        configs = [{'ty': x.attrib['id'], 'key': x.text} for x in categories]
        data = {
            'last_recordcount': int(attrs['recordcount']),
            'pagesize': int(attrs['pagesize']),
            'pagecount': int(attrs['pagecount']),
            'configs': configs
        }
        coll.update_one({'id': site['id']}, {"$set": data})

    def set_classify(self, site, tid):
        """Map a site-specific ``tid`` to the site's classify id.

        Returns:
            The integer key of the first ``site['classify']`` entry whose list
            contains ``tid``; 0 when nothing matches or the site has no
            ``classify`` mapping.
        """
        wanted = int(tid)
        for classify_id, tids in (site.get('classify') or {}).items():
            if wanted in tids:
                return int(classify_id)
        return 0

    # Fetch listing (XML)
    def get_films(self, _site):
        """Crawl the pending XML listing pages of a site into ``films``.

        Returns:
            The page number to store as the site's ``last_page``.
        """
        coll = self.Mg.getCol('films')
        site_coll = self.Mg.getCol('site')
        # Re-read the site so counters written during this run are visible.
        site = site_coll.find_one({'id': _site['id']})
        # A site that was never crawled has no checkpoint yet.
        # NOTE(review): assumes the API numbers pages from 1 - confirm.
        page = site.get('last_page', 1)
        if 'pagecount' not in site or 'pagesize' not in site:
            # Metadata was never fetched successfully; nothing to crawl.
            return page
        remaining = max(int(site['pagecount']) - page, 0)
        # NOTE(review): only tids listed in filter_tid are kept; the sample
        # site data suggests the list may be meant as an exclusion list.
        wanted_tids = site['filter_tid']
        add_num = update_num = same_num = 0
        # The range is only an upper bound: the loop normally ends when the
        # API returns an empty page.
        progress = tqdm(range(remaining * site['pagesize']),
                        total=remaining,
                        desc=self.set_desc(site, page, add_num, update_num,
                                           same_num))
        for _ in progress:
            # Checkpoint before fetching so an interrupted run resumes here.
            site_coll.update_one({'_id': site['_id']},
                                 {'$set': {'last_page': page}})
            url = site['api_url'] + '?ac=videolist&pg=%d' % page
            current = page
            page += 1
            try:
                res = requests.get(url, timeout=20)
            except requests.RequestException:
                # Return the failed page so the next run retries it.
                return current
            try:
                root = ET.fromstring(res.text.strip())
            except ET.ParseError:
                continue
            items = root.find('list')
            if items is None or len(items) == 0:
                return page
            for child in items:
                tid = int(self._text(child, 'tid'))
                if tid not in wanted_tids:
                    continue
                video = []
                dl_node = child.find('dl')
                for source in (dl_node if dl_node is not None else ()):
                    flag = source.attrib.get('flag', '')
                    if flag == '' or not source.text:
                        continue
                    video.append({
                        'key': flag,
                        'plist': source.text.split('#')
                    })
                if not video:
                    continue
                film_name = (self._text(child, 'name') or '').upper()
                note = (self._text(child, 'note') or '').upper()
                year = self.set_year(self._text(child, 'year'))
                info = {
                    'film_name': film_name,
                    'tid': tid,
                    'pic': (self._text(child, 'pic') or '').strip(),
                    'site_id': site['id'],
                    'state': int(self.filter_film(film_name)),
                    'des': (self._text(child, 'des') or '').strip(),
                    'type': self._text(child, 'type'),
                    'area': self._text(child, 'area'),
                    'year': year,
                    'douban_initial_year': year,
                    'classify': self.set_classify(site, tid),
                    'note': note,
                    'video': video
                }
                if self._upsert_film(coll, info):
                    add_num += 1
                else:
                    same_num += 1
                    update_num += 1
            progress.set_description(
                desc=self.set_desc(site, page, add_num, update_num, same_num))
        return page

    # Update record count and page size (JSON)
    def update_site_json(self, site):
        """Refresh paging counters and category list of a JSON site.

        Best effort like ``update_site``: network errors and non-JSON
        responses leave the site document untouched.
        """
        coll = self.Mg.getCol('site')
        try:
            news = requests.get(site['api_url'], timeout=5).json()
        except (requests.RequestException, ValueError):
            return
        configs = [{'ty': x['type_id'], 'key': x['type_name']}
                   for x in news['class']]
        if 'total' in news:
            data = {
                'last_recordcount': int(news['total']),
                'pagesize': int(news['limit']),
                'pagecount': int(news['pagecount']),
                'configs': configs
            }
        else:
            paging = news['page']
            data = {
                'last_recordcount': int(paging['recordcount']),
                'pagesize': int(paging['pagesize']),
                'pagecount': int(paging['pagecount']),
                'configs': configs
            }
        coll.update_one({'id': site['id']}, {"$set": data})

    def get_films_json(self, _site):
        """Crawl the pending JSON listing pages of a site into ``films``.

        Returns:
            The page number to store as the site's ``last_page``.
        """
        coll = self.Mg.getCol('films')
        site_coll = self.Mg.getCol('site')
        # Re-read the site so counters written during this run are visible.
        site = site_coll.find_one({'id': _site['id']})
        # NOTE(review): assumes the API numbers pages from 1 - confirm.
        page = site.get('last_page', 1)
        if 'pagecount' not in site or 'pagesize' not in site:
            # Metadata was never fetched successfully; nothing to crawl.
            return page
        # Pages left to fetch.
        remaining = max(int(site['pagecount']) - page, 0)
        # NOTE(review): only tids listed in filter_tid are kept; the sample
        # site data suggests the list may be meant as an exclusion list.
        wanted_tids = site['filter_tid']
        add_num = update_num = same_num = 0
        # The range is only an upper bound: the loop normally ends when the
        # API returns an empty page.
        progress = tqdm(range(remaining * site['pagesize']),
                        desc=self.set_desc(site, page, add_num, update_num,
                                           same_num))
        for _ in progress:
            # Checkpoint before fetching so an interrupted run resumes here.
            site_coll.update_one({'_id': site['_id']},
                                 {'$set': {'last_page': page}})
            # Query parameters: pg = page number.
            url = site['api_url'] + '?ac=detail&pg=' + str(page)
            current = page
            page += 1
            try:
                res = requests.get(url, timeout=20)
            except requests.RequestException:
                # Return the failed page so the next run retries it.
                return current
            try:
                root = res.json()
            except ValueError:
                return page
            items = root.get('list') if isinstance(root, dict) else None
            if not items:
                return page
            for child in items:
                # Same int form as the XML path and the fixClassify queries.
                tid = int(child['type_id'])
                if tid not in wanted_tids:
                    continue
                film_name = child['vod_name'].upper() if child['vod_name'] else ''
                note = child['vod_remarks'].upper() if child['vod_remarks'] else ''
                play_from = child['vod_play_from']
                # NOTE(review): with several '$$$'-separated sources only the
                # second source name is kept while the whole play-url string
                # is split on '#'; verify against the API's format.
                key = play_from.split('$$$')[1] if '$$$' in play_from else play_from
                video = [{
                    'key': key,
                    'plist': child['vod_play_url'].split('#')
                }]
                year = self.set_year(child['vod_year'])
                info = {
                    'film_name': film_name,
                    'tid': tid,
                    'pic': child['vod_pic'].strip() if child['vod_pic'] else '',
                    'site_id': site['id'],
                    'state': int(self.filter_film(film_name)),
                    'des': child['vod_content'].strip() if child['vod_content'] else '',
                    'type': child['type_name'],
                    'area': child['vod_area'],
                    'year': year,
                    'douban_initial_year': year,
                    'classify': self.set_classify(site, tid),
                    'note': note,
                    'video': video
                }
                if self._upsert_film(coll, info):
                    add_num += 1
                else:
                    same_num += 1
                    update_num += 1
            progress.set_description(
                desc=self.set_desc(site, page, add_num, update_num, same_num))
        return page

    def set_year(self, old):
        """Normalise a raw year value to an int.

        The markers ``普通话`` and ``年`` are removed before parsing.  Missing,
        non-numeric or implausible values (not after 1970, or later than next
        calendar year) yield ``DEFAULT_YEAR``.
        """
        # Local import: the file's import section is not touched by this block.
        from datetime import date

        text = '' if old is None else str(old)
        cleaned = re.sub('普通话|年', '', text).strip()
        try:
            year = int(cleaned)
        except ValueError:
            return self.DEFAULT_YEAR
        # Upper bound follows the calendar instead of a fixed year, so newly
        # released films keep their real year.
        if self.DEFAULT_YEAR < year <= date.today().year + 1:
            return year
        return self.DEFAULT_YEAR

    def set_video(self, newV, oldV):
        """Merge play sources, letting new entries win on equal ``key``.

        Returns a new list: every entry of ``newV`` followed by the entries of
        ``oldV`` whose ``key`` does not occur in ``newV``.  Neither argument is
        modified.
        """
        new_keys = {entry['key'] for entry in newV}
        return list(newV) + [e for e in oldV if e['key'] not in new_keys]

    def set_desc(self, site, page, addNum, updateNum, sameNum):
        """Build the progress-bar caption for the current crawl state."""
        return "[%s: %d, 总页数: %d, page: %d, 新增: %d, 修改: %d, 相同: %d]" % (
            site['site_name'], site['id'], site['pagecount'], page, addNum,
            updateNum, sameNum)

    def fixClassify(self):
        """Back-fill ``new_tid`` and ``classify`` on existing film documents.

        First applies every site's ``type_change`` mapping to films that have
        no ``new_tid`` yet, then assigns ``classify`` to films that lack it,
        preferring ``new_tid`` over ``tid`` when choosing the class.
        """
        baseClassifyDict = {
            "1": [1, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27, 29],
            "2": [2, 12, 13, 14, 15, 16, 17, 18, 48, 54, 34, 33],
            "3": [3, 45, 38, 37],
            "4": [4, 41, 47, 28, 39, 40],
            "5": [49, 51, 42, 43, 55, 19],
            "6": [60, 44, 43, 42, 30],
            "49": [49, 51, 52, 53, 55, 19]
        }
        coll = self.Mg.getCol('films')
        for site in self.Mg.getCol('site').find():
            for change in site.get('type_change', ()):
                coll.update_many(
                    {
                        'site_id': site['id'],
                        'tid': int(change['ty']),
                        'new_tid': {"$exists": False}
                    },
                    {'$set': {'new_tid': int(change['change'])}})
        # Runs after all type_change overrides so new_tid takes priority.
        # NOTE(review): classify is written as the string key here, while
        # set_classify produces ints - confirm which form readers expect.
        for classify_id, tids in baseClassifyDict.items():
            query = {
                "$or": [
                    {"$and": [{"new_tid": {"$exists": True}},
                              {"new_tid": {"$in": tids}}]},
                    {"$and": [{"new_tid": {"$exists": False}},
                              {"tid": {"$in": tids}}]},
                ],
                "classify": {"$exists": False}
            }
            coll.update_many(query, {'$set': {'classify': classify_id}})
def get_site(site_id=8):
    """Print and return the ``site`` document whose ``id`` equals ``site_id``.

    Args:
        site_id: Value of the ``id`` field to look up.  Defaults to 8, the id
            this helper originally hard-coded.

    Returns:
        The matching document, or None when no site has that id (in which
        case ``None`` is printed instead of raising ``IndexError``).
    """
    site = MongoUtil().getCol('site').find_one({'id': site_id})
    print(site)
    return site
from utils.MongoUtil import MongoUtil Mg = MongoUtil() col = Mg.getCol("site") # site_config = {} # for x in col.find(): # site_config[x['id']] = x['configs'] # # print(site_config) classify = { '1': { '1': [1, 5, 6, 7, 8, 9, 10, 11], '2': [2, 12, 13, 14, 15, 16, 17, 18, 54], '3': [4, 41, 39, 40, 47], '49': [19, 49, 51, 52, 53, 55] }, '2': { '1': [1, 5, 6, 7, 8, 9, 10, 11, 24], '2': [2, 12, 13, 14, 15, 17, 18, 20], '3': [4, 41, 39, 40, 47], '49': [16, 19.21, 22] }, '3': { '1': [1, 5, 6, 7, 8, 9, 10, 11], '2': [2, 12, 13, 14, 15, 17, 18, 19, 21], '3': [4], '49': [16] }, '4': {
def insert_site():
    """Insert the hard-coded ``okzy`` JSON site document into ``site``."""
    # (type_id, type_name) pairs in the order they are stored.
    categories = (
        (1, '电影'), (2, '连续剧'), (3, '综艺'), (4, '动漫'), (5, '资讯'),
        (6, '动作片'), (7, '喜剧片'), (8, '爱情片'), (9, '科幻片'),
        (10, '恐怖片'), (11, '剧情片'), (12, '战争片'), (13, '国产剧'),
        (14, '香港剧'), (15, '韩国剧'), (16, '欧美剧'), (17, '公告'),
        (18, '头条'), (20, '纪录片'), (21, '微电影'), (22, '台湾剧'),
        (23, '日本剧'), (24, '海外剧'), (25, '内地综艺'), (26, '港台综艺'),
        (27, '日韩综艺'), (28, '欧美综艺'), (29, '国产动漫'), (30, '日韩动漫'),
        (31, '欧美动漫'), (32, '港台动漫'), (33, '海外动漫'), (34, '福利片'),
        (35, '解说'), (36, '电影解说'), (37, '伦理片'),
    )
    document = {
        'id': 34,
        'site_key': 'okzy',
        'api_url': 'https://api.okzy.tv/api.php/provide/vod/at/json/?ac=detail',
        'site_name': 'okzy',
        'configs': [{'type_id': type_id, 'type_name': type_name}
                    for type_id, type_name in categories],
        'data_type': 'json',
        'filter_tid': [3, 5, 17, 18, 20, 21, 22, 25, 26, 27, 28, 35, 36]
    }
    site_collection = MongoUtil().getCol('site')
    site_collection.insert_one(document)