def get_soup_from_wiki_search(record: Record, search_url: str) -> BeautifulSoup:
    """Fetch a wiki page, resolving a search-results page to the matching article.

    When ``search_url`` is not a search query (it does not contain
    ``search=``), the soup for that URL is returned as-is. Otherwise the
    search results are scanned in page order and the first result whose link
    text contains ``record.title`` (case-insensitive) is fetched and returned.

    Args:
        record: Record whose ``title`` is matched against the result links.
        search_url: URL of a wiki page or of a wiki search-results page.

    Returns:
        The soup of the resolved page, or ``None`` when the search page could
        not be loaded or no search result matches the title.
    """
    search_soup = make_soup(search_url)
    if 'search=' not in search_url:
        return search_soup
    if search_soup is None:
        # NOTE(review): assumes make_soup can return None on failure (another
        # caller checks for it) -- confirm against make_soup.
        return None

    # str() so a non-string title cannot break the comparison.
    wanted = str(record.title).lower()
    # The language edition of the search URL decides which base the relative
    # result href is appended to.
    wiki_base = en_wiki_base if 'en.' in search_url else nl_wiki_base

    for result in search_soup.findAll('li', class_='mw-search-result'):
        link = result.find('a')
        if link is None:
            # A result without an anchor cannot be followed; skip it.
            continue
        if wanted in link.text.lower():
            return make_soup(wiki_base + link['href'])
    return None
def get_page_url_from_title(title: str) -> str:
    """Search IMDB for ``title`` and return the URL of the first matching page.

    The search page is fetched from ``imdb_find_url`` with the URL-quoted
    title. Within the result section whose header contains ``Titles``, the
    first row whose result link text contains ``title`` (case-insensitive)
    is selected.

    Args:
        title: Title to look up.

    Returns:
        ``imdb_base_url`` plus the matched row's href with any query string
        removed, or ``''`` when nothing matches. Every miss returns ``''`` so
        the result is always a string, as the annotation promises.
    """
    soup = make_soup(imdb_find_url + quote(title))
    # NOTE(review): the None check assumes make_soup may return None on
    # failure -- confirm against make_soup.
    if soup is None or soup.find('div', class_='findNoResults') is not None:
        return ''

    titles_section = None
    for section in soup.findAll('div', class_='findSection'):
        header = section.find('h3')
        if header is not None and 'Titles' in header.text:
            titles_section = section
            break
    if titles_section is None:
        return ''

    table = titles_section.find('table')
    if table is None:
        return ''

    wanted = title.lower()
    for row in table.findAll('tr'):
        cell = row.find('td', class_='result_text')
        link = cell.find('a') if cell is not None else None
        if link is None or wanted not in link.text.strip().lower():
            continue
        # The href is taken from the first anchor of the row (not necessarily
        # the result_text one), matching the previous behaviour.
        matching_url = imdb_base_url + row.find('a')['href'].split("?")[0]
        print("Parsing IMDB page: " + matching_url)
        return matching_url
    return ''
def get_movie_row(title, cast=None) -> Optional[MovieRow]:
    """Build a ``MovieRow`` for ``title`` from its IMDB page.

    Year of production, cast and language are each filled in on a
    best-effort basis: a failure while parsing one of them is reported on
    stdout and leaves that field untouched.

    Args:
        title: Title to look up on IMDB.
        cast: Cast to store on the row; when empty or omitted, the cast is
            parsed from the page instead.

    Returns:
        The populated row, or ``None`` when no IMDB page matches ``title``.
    """
    page_url = get_page_url_from_title(title)
    if not page_url:
        return None

    page = make_soup(page_url)
    row = MovieRow(title)

    try:
        row.yop = get_year_of_production_from_soup(page)
    except Exception:
        print("Failed to parse Year of Production from movie page (IMDB)")

    if cast and len(cast):
        # A caller-supplied cast takes precedence over the scraped one.
        row.cast = cast
    else:
        try:
            row.cast = get_cast_list_string(page)
            print("Set cast: " + row.cast)
        except Exception:
            print("Failed to parse cast from movie page (IMDB)")

    try:
        row.language = get_language_from_soup(page)
    except Exception:
        print("Failed to parse broadcast language from movie page")

    return row
def add_nl_info(record) -> bool:
    """Try to add schedule info to ``record`` from its NL wiki page.

    Args:
        record: Record to enrich; its ``title`` is used to build the page URL.

    Returns:
        The result of ``parse_wiki_page_nl`` when the page was loaded, or
        ``False`` when the page could not be loaded (``URLError`` or no soup).
    """
    try:
        # str() keeps this in line with add_en_info for non-string titles.
        url = nl_wiki_base + title_to_search(str(record.title))
        soup = make_soup(url)
        if soup is None:
            # NOTE(review): assumes make_soup may return None on failure, as
            # add_en_info already checks for -- confirm against make_soup.
            return False
        parsed = parse_wiki_page_nl(record, soup)
        if parsed:
            print(f"Added schedule info from (NL Wiki) : {url} ")
        return parsed
    except URLError:
        print("No matching NL wiki page")
        return False
def add_en_info(record) -> bool:
    """Try to add schedule info to ``record`` from its EN wiki page.

    Args:
        record: Record to enrich; its ``title`` is used to build the page URL.

    Returns:
        The result of ``parse_wiki_page_en`` when the page was loaded, or
        ``False`` when the page could not be loaded (``URLError`` or no soup).
    """
    try:
        url = en_wiki_base + title_to_search(str(record.title))
        soup = make_soup(url)
        if soup is None:
            # Explicit False instead of falling off the end with None.
            return False
        parsed = parse_wiki_page_en(record, soup)
        if parsed:
            print(f"Added schedule info from (EN Wiki) : {url} ")
        return parsed
    except URLError:
        print("No matching EN wiki page")
        return False
def add_schedule_info_to_record(record: Record):
    """Add schedule info to ``record`` from the first source that has it.

    Sources are tried in order: IMDB, then the EN wiki, then the NL wiki.
    Finding an IMDB page for the title counts as success regardless of what
    the IMDB parser does with it. Nothing is returned.

    Args:
        record: Record to enrich; its ``title`` drives the IMDB lookup.
    """
    # Stop as soon as a page was found & information added.
    imdb_url = IMDB.get_page_url_from_title(record.title)
    if imdb_url:
        imdb_soup = make_soup(imdb_url)
        IMDB.add_schedule_info_to_record(record, imdb_soup)
        return

    if WIKI.add_en_info(record):
        return

    WIKI.add_nl_info(record)
def get_summary_detail_row(title, episode, season) -> Optional[SeriesDetailRow]:
    """Look up one episode of a series on IMDB and build its detail row.

    Args:
        title: Series title to search for on IMDB.
        episode: 1-based episode number within the season (str or int).
        season: Season number (str or int).

    Returns:
        The row built by ``get_series_summary_from_imdb_page_soup``, or
        ``None`` when the series page is not found, the season page cannot
        be loaded or parsed, or the season is not listed. When only the
        episode page cannot be located, the row is still built, with
        ``ep_soup=None``.
    """
    record_url = get_page_url_from_title(title)
    if not record_url:
        return None

    ep_guide_url = record_url + f'episodes?season={season}'
    print("Season URL: " + ep_guide_url)

    try:
        season_soup = make_soup(ep_guide_url)
        # NOTE(review): a link to the submission guide presumably marks a
        # title without an episode list -- confirm against the IMDB markup.
        if season_soup.find('a', text='TV Episodes submission guide') is not None:
            return None
    except Exception:
        return None

    try:
        season_options = season_soup.find('select', id='bySeason').findAll('option')
        if int(season) > len(season_options):
            print(f"Season {season} of {title} not visible (IMDB)")
            return None
    except Exception:
        return None

    # Best effort: the year is the last token of the first episode's airdate;
    # it stays '' when that cannot be parsed.
    season_year = ''
    try:
        first_ep = season_soup.find('div', class_=list_item_regex)
        season_year = int(first_ep.find('div', class_='airdate').text.strip().split(" ")[-1])
    except Exception:
        pass

    ep_soup = None
    try:
        # f-string instead of "+" so int season/episode values do not raise a
        # TypeError that the handler below would swallow, skipping the lookup.
        print(f"Searching {ep_guide_url} For season: {season} episode: {episode}")
        episode_list = season_soup.find('div', class_='list detail eplist').findAll('div', class_=list_item_regex)
        episode_box = episode_list[int(episode) - 1]
        ep_soup = make_soup(imdb_base_url + episode_box.find('a')['href'])
    except IndexError as e:
        print(f"Episode {episode} not visible for title {title} : {e}")
    except (TypeError, AttributeError) as e:
        print(f"Error adding info from IMDB for record with title: {title} for ep {episode} season {season} ")
        print("Exception: " + str(e))

    return get_series_summary_from_imdb_page_soup(ep_soup=ep_soup, episode=episode, season=season, title=title,
                                                  season_year=season_year)


def get_series_summary_from_imdb_page_soup(ep_soup, title, episode, season, season_year) -> SeriesDetailRow:
    s_row = SeriesDetailRow()
    s_row.title = title
    s_row.episode = episode