def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch match highlights from one AJAX pagination page of the site.

    Args:
        pagelet_num: page index sent as the ``current_page`` form field —
            base (0 or 1) depends on the endpoint; TODO confirm with caller.
        max_days_ago: maximum age (in days) a highlight may have to be kept.

    Returns:
        A list of SportyHLHighlight objects; empty on an invalid pagination
        token or when nothing recent is found.
    """
    highlights = []

    page = requests.post(
        ROOT_URL,
        data=_build_pagination_payload(pagelet_num),
        headers={
            'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
            'cache-control': "no-cache",
        })
    if page.text == 'INVALID TOKEN!':
        return []

    # The endpoint wraps the rendered listing HTML in a JSON envelope.
    html = json.loads(page.text)['output'] \
        .replace("\n", "") \
        .replace("\t", "") \
        .replace("\\", "")
    soup = BeautifulSoup(html, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='listing-inner'):
        # Extract match name — only keep "X vs Y" fixture entries.
        match_name = str(vid.find(class_='title').find('a').get_text())
        if 'vs' not in match_name:
            continue

        # Extract view count - NOT AVAILABLE for this website
        view_count = 0

        # Extract category (competition badge).
        info = vid.find(class_='term-badge')
        if not info:
            continue
        category = str(info.find('a').get_text())

        # Extract time since video added. The page shows day granularity
        # only, so patch in the current hour/minute to keep ordering stable.
        date = vid.find(class_='post-meta').find('time')
        if not date:
            continue
        now = datetime.now()
        time_since_added = str(date.get_text())
        # Bug fix: check the parse result for None BEFORE calling .replace()
        # on it — previously an unparseable date raised AttributeError
        # instead of being skipped.
        time_since_added_date = dateparser.parse(time_since_added)
        if not time_since_added_date:
            continue
        time_since_added_date = time_since_added_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)
        if not fetcher_footyroom.is_recent(time_since_added_date, max_days_ago):
            continue

        # Extract image link (may be None when 'data-src' is missing —
        # preserved as-is; downstream presumably tolerates it).
        image = vid.find(class_='img-holder')
        if not image:
            continue
        img_link = image.get('data-src')

        # Extract link to the highlight page.
        link = str(vid.find(class_='img-holder').get('href'))
        if not _is_valid_link(link):
            continue
        video_links = _get_video_links(link)
        if not video_links:
            continue

        # Add multiple video links — one entry per (type, url) pair.
        for video_type, video_link in video_links:
            highlights.append(
                SportyHLHighlight(video_link, match_name, img_link, view_count,
                                  category, time_since_added, video_type))

    return highlights


def _build_pagination_payload(pagelet_num):
    """Build the multipart/form-data body for the pagination AJAX endpoint.

    Produces exactly the byte sequence the endpoint expects: each field is
    "--<boundary>\\r\\nContent-Disposition: ...\\r\\n\\r\\n<value>\\r\\n",
    terminated by "--<boundary>--".
    """
    boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
    fields = [
        ("query[category]", "58,29,72,69,30,65,907,31,419,67,18,417,25,63,82,28,256,902"),
        ("query[count]", "15"),
        ("query[order_by]", "date"),
        ("query[order]", "DESC"),
        ("query[style]", "listing-classic"),
        ("query[show_excerpt]", "0"),
        ("query[cats-tags-condition]", "and"),
        ("query[cats-condition]", "in"),
        ("query[tags-condition]", "in"),
        ("query[featured_image]", "0"),
        ("query[ignore_sticky_posts]", "1"),
        ("query[disable_duplicate]", "0"),
        ("query[paginate]", "more_btn"),
        ("query[pagination-show-label]", "0"),
        ("query[columns]", "3"),
        ("query[override-listing-settings]", "0"),
        ("query[_layout][state]", "1|1|1"),
        ("query[_layout][page]", "1-col"),
        ("view", "Publisher_Classic_Listing_1_Shortcode"),
        ("current_page", str(pagelet_num)),
        ("_bs_pagin_token", "2670529"),
        ("action", "pagination_ajax"),
    ]
    body = "".join(
        "--" + boundary + "\r\nContent-Disposition: form-data; name=\"" +
        name + "\"\r\n\r\n" + value + "\r\n"
        for name, value in fields
    )
    return body + "--" + boundary + "--"
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch match highlights from one pagination page of the site.

    Args:
        pagelet_num: page index appended to ``ROOT_URL + PAGELET_EXTENSION``.
        max_days_ago: maximum age (in days) a highlight may have to be kept.

    Returns:
        A list of OurMatchHighlight objects (with score set when available).
    """
    highlights = []
    page = requests.get(ROOT_URL + PAGELET_EXTENSION + str(pagelet_num))
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='vidthumb'):
        thumb = vid.find(class_='thumb')
        if not thumb:
            continue
        link_tag = thumb.find('a')

        # Extract match name — only keep "X vs Y" fixture entries.
        match_name = str(link_tag.get('title'))
        if 'vs' not in match_name:
            continue

        # Extract view count, e.g. "1.2K" -> 1200.
        # NOTE(review): when the text has no 'K' suffix the raw string is
        # kept (not converted to int) — preserved as-is, but looks
        # unintended; confirm what the highlight model expects.
        video_info = vid.find(class_="count")
        view_count = 0
        if video_info:
            count = video_info.get_text()
            view_count = int(float(count.replace('K', '')) * 1000) if 'K' in count else count

        # Extract category (competition).
        info = vid.find(class_='flecha')
        if not info:
            continue
        category = str(info.get_text())

        # Extract time since video added. Day granularity on the page, so
        # patch in the current hour/minute to keep ordering stable.
        date = vid.find(class_='time')
        if not date:
            continue
        now = datetime.now()
        time_since_added = str(date.get_text())
        # Bug fix: check the parse result for None BEFORE calling .replace()
        # on it — previously an unparseable date raised AttributeError
        # instead of being skipped.
        time_since_added_date = dateparser.parse(time_since_added)
        if not time_since_added_date:
            continue
        time_since_added_date = time_since_added_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)
        if not fetcher_footyroom.is_recent(time_since_added_date, max_days_ago):
            continue

        # Extract image link.
        image = thumb.find('img')
        if not image:
            continue
        img_link = str(image.get('src'))

        # Extract link to the highlight page.
        link = str(link_tag.get('href'))
        if not _is_valid_link(link):
            continue

        # Get highlight page HTML for video links, score and goal data.
        detail_page = requests.get(link)
        detail_soup = BeautifulSoup(detail_page.content, 'html.parser')
        video_links = _get_video_links(detail_soup)
        if not video_links:
            continue
        score = _get_match_score(detail_soup)
        try:
            goal_data = fetcher_score_ourmatch.get_goal_data(detail_soup)
        except Exception:
            # Best effort: goal data is optional metadata.
            goal_data = []

        # Add multiple video links — one entry per (type, url) pair.
        for video_type, video_link in video_links:
            h = OurMatchHighlight(video_link, match_name, img_link, view_count,
                                  category, time_since_added, goal_data,
                                  video_type)
            if score:
                h.set_score(score[0], score[1])
            highlights.append(h)

    return highlights
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch 'BBC Match of the Day' highlights from the site's front page.

    Args:
        pagelet_num: unused — only the front page is scraped; kept for
            interface parity with the other fetchers.
        max_days_ago: maximum age (in days) a highlight may have to be kept.

    Returns:
        A list of HighlightsFootballHighlight objects.
    """
    highlights = []
    page = requests.get(ROOT_URL)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='td_module_1'):
        # Extract match name — only keep 'BBC Match of the Day' entries.
        match_name = str(vid.find(class_='td-image-wrap').get('title'))
        if 'bbc match of the day' not in match_name.lower():
            continue

        # Bug fix: view_count was referenced in the constructor call below
        # but never assigned, raising NameError on the first matching entry.
        # View counts are not available on this website.
        view_count = 0

        # Extract category.
        info = vid.find(class_='td-post-category')
        if not info:
            continue
        category = str(info.get_text())

        # Extract time since video added. Day granularity on the page, so
        # patch in the current hour/minute to keep ordering stable.
        date = vid.find(class_='td-module-date')
        if not date:
            continue
        now = datetime.now()
        time_since_added = str(date.get_text())
        # Bug fix: check the parse result for None BEFORE calling .replace()
        # on it — previously an unparseable date raised AttributeError
        # instead of being skipped.
        time_since_added_date = dateparser.parse(time_since_added)
        if not time_since_added_date:
            continue
        time_since_added_date = time_since_added_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)
        if not fetcher_footyroom.is_recent(time_since_added_date, max_days_ago):
            continue

        # Extract image link from the inline CSS background-image.
        image = vid.find(class_='td-image-wrap')
        if not image:
            continue
        style = image.find("span").get("style")
        # Raw string fixes the invalid '\(' escape-sequence warning.
        search_result = re.search(r"background-image: url\((.*?)\)", style)
        img_link = search_result.groups()[0] if search_result else ''

        # Extract link to the highlight page.
        link = str(vid.find(class_="td-image-wrap").get("href"))
        if not _is_valid_link(link):
            continue
        video_links = _get_video_links(link)
        # Robustness: guard against a falsy return, consistent with the
        # sibling fetchers (a None return would raise TypeError below).
        if not video_links:
            continue

        for video_type, video_link in video_links:
            highlights.append(
                HighlightsFootballHighlight(video_link, match_name, img_link,
                                            view_count, category,
                                            time_since_added, video_type))

    return highlights
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch match highlights from one AJAX block page of the site.

    Args:
        pagelet_num: 0-based page index; the endpoint's ``td_current_page``
            is 1-based, hence the ``+ 1``.
        max_days_ago: maximum age (in days) a highlight may have to be kept.

    Returns:
        A list of HighlightsFootballHighlight objects.
    """
    highlights = []
    page = requests.post(ROOT_URL, data={
        'action': 'td_ajax_block',
        'block_type': 'td_block_3',
        'td_current_page': pagelet_num + 1
    })

    # The endpoint wraps the rendered listing HTML in a JSON envelope.
    html = json.loads(page.text)['td_data'] \
        .replace("\n", "") \
        .replace("\t", "") \
        .replace("\\", "")
    soup = BeautifulSoup(html, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='td_module_1'):
        # Extract match name — only keep "X vs Y" fixture entries.
        match_name = str(vid.find('img').get('title'))
        if 'vs' not in match_name:
            continue

        # Extract view count - NOT AVAILABLE for this website
        view_count = 0

        # Extract category.
        info = vid.find(class_='td-post-category')
        if not info:
            continue
        category = str(info.get_text())

        # Extract time since video added. Day granularity on the page, so
        # patch in the current hour/minute to keep ordering stable.
        date = vid.find(class_='td-module-date')
        if not date:
            continue
        now = datetime.now()
        time_since_added = str(date.get_text())
        # Bug fix: check the parse result for None BEFORE calling .replace()
        # on it — previously an unparseable date raised AttributeError
        # instead of being skipped.
        time_since_added_date = dateparser.parse(time_since_added)
        if not time_since_added_date:
            continue
        time_since_added_date = time_since_added_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)
        if not fetcher_footyroom.is_recent(time_since_added_date, max_days_ago):
            continue

        # Extract image link.
        image = vid.find('img')
        if not image:
            continue
        img_link = str(image.get("src"))

        # Extract link to the highlight page.
        link = str(vid.find("a").get("href"))
        if not _is_valid_link(link):
            continue
        video_link = _get_video_link(link)
        if not video_link:
            continue

        highlights.append(
            HighlightsFootballHighlight(video_link, match_name, img_link,
                                        view_count, category,
                                        time_since_added))

    return highlights
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Scrape one pagelet of the listing and return recent match highlights.

    Args:
        pagelet_num: page index appended to ``ROOT_URL + PAGELET_EXTENSION``.
        max_days_ago: how many days back a highlight may date and still be
            kept.

    Returns:
        A list of HoofootHighlight objects, one per available video source.
    """
    results = []
    listing = BeautifulSoup(
        PROXY.get(ROOT_URL + PAGELET_EXTENSION + str(pagelet_num)).content,
        'html.parser')

    for entry in listing.find_all(id="cocog"):
        # Link to the highlight page; skip entries without a usable URL.
        anchor = entry.find("a")
        href = str(anchor.get("href"))
        if not _is_valid_link(href):
            continue

        # Resolve the relative link and collect the video sources.
        sources = _get_video_links(_form_full_link(href))
        if not sources:
            continue

        # Thumbnail image; its alt text doubles as the match title.
        thumb = anchor.find("img")
        if not thumb:
            continue
        thumbnail = str(thumb.get("src"))
        title = str(thumb.get("alt"))
        # Only "Team - Team" titles are real matches.
        if ' - ' not in title:
            continue

        views = 0  # view counts are not published by this website

        # Competition name comes from the info badge's image alt text.
        badge = entry.find(class_="info")
        if not badge:
            continue
        badge_img = badge.find("img")
        if not badge_img:
            continue
        competition = str(badge_img.get("alt"))

        # Publication date lives in a <font> tag inside the badge; skip
        # entries whose date cannot be parsed or is too old.
        added_tag = badge.find("font")
        if not added_tag:
            continue
        added_text = str(added_tag.get_text())
        added_date = dateparser.parse(added_text)
        if not added_date:
            continue
        if not fetcher_footyroom.is_recent(added_date, max_days_ago):
            continue

        # One highlight per (type, url) source pair.
        for source_type, source_url in sources:
            results.append(HoofootHighlight(source_url, title, thumbnail,
                                            views, competition, added_text,
                                            source_type))

    return results