def parse_film_info(self, url):
    """Scrape a film page and build the record to store for it.

    The video code comes from the ``watch-<code>`` part of ``url``; view
    count, genre tags, model, title and cover image are pulled out of the
    page HTML with regular expressions.

    Args:
        url: Address of the film page.

    Returns:
        ``None`` when ``url`` carries no video code, when ``parse_webpage``
        returns ``None``, or when markup needed for the record is missing
        from the page. Otherwise a dict:

        * if ``self.mongo.info_is_exists(url)`` is truthy, only the fields
          ``url``, ``count``, ``update_date`` and ``tags``;
        * else the full record, which additionally carries ``code``,
          ``search_code``, ``img_url``, ``models`` and ``title``.
    """
    code_match = re.search(r'(?<=watch-)(\w+-){0,2}\w*\d+', url)
    if code_match is None:
        # Validate the URL first so a URL without a code costs no request.
        return None
    page_film = parse_webpage(url)
    if page_film is None:
        return None

    video_code = code_match.group().upper()
    search_video_code = self.code_special_case(video_code)

    # Fields shared by both return shapes.
    count_match = re.search(
        r'<div class="film_view_count".*?>(\d*)</div>', page_film)
    genre_match = re.search(r'<li>Genre:\s*(.*?)</li>', page_film)
    if (count_match is None or not count_match.group(1)
            or genre_match is None):
        # Previously surfaced as AttributeError (no match) or ValueError
        # (empty counter); callers already have to cope with None.
        return None
    view_count = int(count_match.group(1))

    tag = re.findall(r'<a.*?>(.*?)</a>', genre_match.group(1))
    tag = self.switch_tag(tag)
    print(tag)

    if self.mongo.info_is_exists(url):
        # Update path: model/title/image are not part of this shape, so
        # they are not parsed here.
        return {
            'url': url,
            'count': view_count,
            'update_date': datetime.datetime.now(),
            'tags': tag,
        }

    model_match = re.search(r'<.*>Models:.*?>.*?>', page_film)
    title_match = re.search(r'<title>(.*)</title>', page_film)
    img_match = re.search(
        r'<img itemprop="image" src="(.*?)" title="', page_film)
    if model_match is None or title_match is None or img_match is None:
        return None

    # Drop the surrounding tags, then the "Models: " label itself.
    model = re.sub('<.*?>', '', model_match.group()).replace('Models: ', '')
    title = re.sub('<.*?>', '', title_match.group())
    img_url = img_match.group(1)

    # code_special_case may yield None (presumably for films without a
    # code number); the external lookup is skipped in that case.
    if search_video_code is not None:
        indexav_info = self.parse_indexav(search_video_code)
        # Values from parse_indexav override the page's own when present.
        if indexav_info['model'] is not None:
            model = indexav_info['model']
        if indexav_info['video_title'] is not None:
            title = indexav_info['video_title']

    return {
        'code': video_code,
        'search_code': search_video_code,
        'url': url,
        'count': view_count,
        'img_url': img_url,
        'models': model,
        'title': title,
        'update_date': datetime.datetime.now(),
        'tags': tag,
    }
def parse_film_info(self, url):
    """Scrape a film page and build its full record.

    The video code comes from the ``watch-<code>`` part of ``url``; view
    count, model, title and cover image are pulled out of the page HTML
    with regular expressions.

    Args:
        url: Address of the film page.

    Returns:
        ``None`` when ``url`` carries no video code, when ``parse_webpage``
        returns ``None``, or when markup needed for the record is missing
        from the page. Otherwise a dict with the keys ``code``,
        ``search_code``, ``url``, ``count``, ``models``, ``title`` and
        ``img_url``.
    """
    # Raw strings: '\w' / '\d' in a plain literal are invalid escapes.
    code_match = re.search(r'(?<=watch-)(\w+-){0,2}\w*\d+', url)
    if code_match is None:
        # Validate the URL first so a URL without a code costs no request.
        return None
    page_film = parse_webpage(url)
    if page_film is None:
        # Without this guard re.search would raise TypeError on None.
        return None

    video_code = code_match.group().upper()
    search_video_code = self.code_special_case(video_code)

    count_match = re.search(
        r'<div class="film_view_count".*?>(\d*)</div>', page_film)
    model_match = re.search(r'<.*>Models:.*?>.*?>', page_film)
    title_match = re.search(r'<title>(.*)</title>', page_film)
    img_match = re.search(
        r'<img itemprop="image" src="(.*?)" title="', page_film)
    if (count_match is None or not count_match.group(1)
            or model_match is None or title_match is None
            or img_match is None):
        # Previously surfaced as AttributeError (no match) or ValueError
        # (empty counter).
        return None

    # Drop the surrounding tags, then the "Models: " label itself.
    model = re.sub('<.*?>', '', model_match.group()).replace('Models: ', '')
    title = re.sub('<.*?>', '', title_match.group())

    # code_special_case may yield None; the external lookup is skipped then.
    if search_video_code is not None:
        indexav_info = self.parse_indexav(search_video_code)
        # Values from parse_indexav override the page's own when present.
        if indexav_info['model'] is not None:
            model = indexav_info['model']
        if indexav_info['video_title'] is not None:
            title = indexav_info['video_title']

    return {
        'code': video_code,
        'search_code': search_video_code,
        'url': url,
        'count': int(count_match.group(1)),
        'models': model,
        'title': title,
        'img_url': img_match.group(1),
    }
def parse_indexav(self, video_code):
    """Search indexav.com for a video code and scrape actor and title.

    Args:
        video_code: Code appended verbatim to the search URL's ``keyword``
            query parameter.

    Returns:
        A dict with the keys ``model`` and ``video_title``. Each value is
        the text of the first matching ``video_actor`` / ``video_title``
        span with HTML tags removed, or ``None`` when the span is not found
        or ``parse_webpage`` returned ``None``.
    """
    page_indexav = parse_webpage(
        'https://indexav.com/search?keyword=' + video_code)

    result = {'model': None, 'video_title': None}
    if page_indexav is None:
        # Other callers in this file guard against parse_webpage returning
        # None; without this check re.search would raise TypeError.
        return result

    span_patterns = (
        ('model', r'<span class="video_actor".*?>(.*)</span>'),
        ('video_title', r'<span class="video_title".*?>(.*)</span>'),
    )
    for key, pattern in span_patterns:
        found = re.search(pattern, page_indexav)
        if found is not None:
            # Strip every tag from the whole match, leaving only its text.
            result[key] = re.sub('<.*?>', '', found.group())
    return result
def parse_film_info(self, url):
    """Scrape a film page and build the record to store for it.

    The video code comes from the ``watch-<code>`` part of ``url``; view
    count, stream URL, model, title and cover image are pulled out of the
    page HTML with regular expressions.

    Args:
        url: Address of the film page.

    Returns:
        ``None`` when ``url`` carries no video code, when ``parse_webpage``
        returns ``None``, or when markup needed for the record is missing
        from the page. Otherwise a dict:

        * if ``self.mongo.info_is_exists(url)`` is truthy, only the fields
          ``url``, ``count``, ``update_date`` and ``film_url``;
        * else the full record, which additionally carries ``code``,
          ``search_code``, ``img_url``, ``models`` and ``title``.

        ``film_url`` is ``None`` whenever it cannot be extracted.
    """
    import base64

    code_match = re.search(r'(?<=watch-)(\w+-){0,2}\w*\d+', url)
    if code_match is None:
        # Validate the URL first so a URL without a code costs no request.
        return None
    page_film = parse_webpage(url)
    if page_film is None:
        return None

    video_code = code_match.group().upper()
    search_video_code = self.code_special_case(video_code)

    count_match = re.search(
        r'<div class="film_view_count".*?>(\d*)</div>', page_film)
    if count_match is None or not count_match.group(1):
        # Previously surfaced as AttributeError (no match) or ValueError
        # (empty counter); callers already have to cope with None.
        return None
    view_count = int(count_match.group(1))

    # The stream URL sits in a "{file: ...}" snippet, either as a quoted
    # string or as a base64 payload passed to window.atob(...).
    film_url = None
    file_match = re.search('{file: (.*?)}', page_film)
    if file_match is not None:
        file_expr = file_match.group(1)
        if 'window' in file_expr:
            # The atob call is looked up in the whole page, as before.
            encoded = re.search(r'window\.atob\("(.*?)"\)', page_film)
            if encoded is not None:
                try:
                    film_url = base64.b64decode(
                        encoded.group(1)).decode('utf-8')
                except ValueError:
                    # binascii.Error and UnicodeDecodeError both derive
                    # from ValueError; film_url is optional, so a bad
                    # payload leaves it as None.
                    film_url = None
        else:
            quoted = re.search('"(.*?)"', file_expr)
            if quoted is not None:
                film_url = quoted.group(1)
    print("film_url is {}".format(film_url))

    if self.mongo.info_is_exists(url):
        # Update path: model/title/image are not part of this shape, so
        # they are not parsed here.
        return {
            'url': url,
            'count': view_count,
            'update_date': datetime.datetime.now(),
            'film_url': film_url,
        }

    model_match = re.search(r'<.*>Models:.*?>.*?>', page_film)
    title_match = re.search(r'<title>(.*)</title>', page_film)
    img_match = re.search(
        r'<img itemprop="image" src="(.*?)" title="', page_film)
    if model_match is None or title_match is None or img_match is None:
        return None

    # Drop the surrounding tags, then the "Models: " label itself.
    model = re.sub('<.*?>', '', model_match.group()).replace('Models: ', '')
    title = re.sub('<.*?>', '', title_match.group())
    img_url = img_match.group(1)

    # code_special_case may yield None (presumably for films without a
    # code number); the external lookup is skipped in that case.
    if search_video_code is not None:
        indexav_info = self.parse_indexav(search_video_code)
        # Values from parse_indexav override the page's own when present.
        if indexav_info['model'] is not None:
            model = indexav_info['model']
        if indexav_info['video_title'] is not None:
            title = indexav_info['video_title']

    return {
        'code': video_code,
        'search_code': search_video_code,
        'url': url,
        'count': view_count,
        'img_url': img_url,
        'models': model,
        'title': title,
        'update_date': datetime.datetime.now(),
        'film_url': film_url,
    }