import os
import re
import time
from typing import List, Optional, Tuple, Union

import enlighten
import requests
from bs4 import BeautifulSoup

# DATA_DIR, logger, and the project helpers (load_file, save_file, get_filepath,
# get_episodes, verify_episode, get_appearances, sleep_from) are assumed to be
# provided elsewhere in the project.


def final(silent_skip: bool, process_: bool):
    """Generates the latest application static data.json file, used by the backend API."""
    descriptions = load_file(os.path.join(DATA_DIR, 'descriptions.json'), True)
    # One entry per season; The Office ran for nine seasons.
    seasons = [{'season_id': season, 'episodes': []} for season in range(1, 10)]

    if process_:
        logger.info('Processing before building final.json')
        try:
            process(["--all"])
        except SystemExit:
            # process is a CLI command; invoking it directly raises SystemExit,
            # which we swallow so the build can continue.
            pass

    for season_id, episode_id in get_episodes():
        # Load data file
        try:
            episode_data = load_file(get_filepath(season_id, episode_id, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(
                    f'No data for Season {season_id}, Episode {episode_id} available. Null data inserted.')
            episode_data = None

        description = descriptions[season_id - 1][episode_id - 1]
        seasons[season_id - 1]['episodes'].append(
            {
                'title': description['title'].strip(),
                'description': description['description'].strip(),
                'episode_id': episode_id,
                'characters': get_appearances(season_id, episode_id),
                'scenes': episode_data
            }
        )

    logger.info('Saving to data.json')
    save_file(os.path.join(DATA_DIR, 'data.json'), seasons, True)
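# For reference, a sketch of the data.json shape final() emits (values are
# illustrative, not taken from the real dataset):
#
# [
#   {
#     "season_id": 1,
#     "episodes": [
#       {
#         "title": "Pilot",
#         "description": "...",
#         "episode_id": 1,
#         "characters": [...],   # from get_appearances()
#         "scenes": [...]        # processed quote sections, or null if missing
#       }
#     ]
#   },
#   ...
# ]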
def fetch(season: int, episode: int, delay: float, all: bool, overwrite: bool, silent_skip: bool):
    """
    Downloads raw quote pages from 'officequotes.net'.

    Fetches quote pages, placing them in the 'html' folder as unmodified UTF-8 HTML files.
    """
    episodes: List[Tuple[int, int]]
    if all:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Fetching Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to fetch.')
        logger.info('Check --help for more information on this command.')
        return

    logger.debug(f'Ready to start fetching {len(episodes)} quote page{"s" if len(episodes) > 1 else ""}')

    session = requests.Session()
    # Backdate the "last request" so the first fetch is not delayed.
    last_request = time.time() - delay

    with enlighten.Manager() as manager:
        with manager.counter(total=len(episodes), desc='Fetching...', unit='episodes') as pbar:
            for _season, _episode in episodes:
                filepath = get_filepath(_season, _episode, 'html')

                # Check if HTML file exists
                if not overwrite and os.path.exists(filepath):
                    if not silent_skip:
                        logger.debug(f'Skipping Season {_season}, Episode {_episode}: File already exists.')
                else:
                    logger.info(f'Fetching Season {_season}, Episode {_episode}...')

                    # Generate link, make request
                    link = f"http://officequotes.net/no{_season}-{str(_episode).zfill(2)}.php"
                    sleep_from(delay, last_request, manager)  # Sleep at least :delay: seconds.
                    resp = session.get(link)
                    last_request = time.time()

                    if resp.ok:
                        # Write data to file
                        save_file(filepath, resp.text, False)
                        logger.debug('Successfully fetched & saved.')
                    else:
                        logger.error(f'Fetching failed. Erroneous response code {resp.status_code}.')

                pbar.update()

    logger.info('Fetching complete.')
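# sleep_from is defined elsewhere in the project; a minimal sketch of the
# rate-limiting behaviour its call site implies (sleep until at least :delay:
# seconds have passed since :last_request:) might look like the following.
# Passing :manager: through is an assumption -- presumably so a status bar can
# display the wait.
#
# def sleep_from(delay: float, last_request: float, manager) -> None:
#     remaining = delay - (time.time() - last_request)
#     if remaining > 0:
#         time.sleep(remaining)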
def algolia(silent_skip: bool, process_: bool):
    """
    Generates algolia.json, an all-encompassing file for Algolia's search index.
    """
    if process_:
        logger.info('Processing before building algolia.json')
        try:
            process(["--all", '--silent'])
        except SystemExit:
            # process is a CLI command; invoking it directly raises SystemExit.
            pass

    data = []
    episode_num_abs, section_num_abs, quote_num_abs = 0, 0, 0

    for season, episode in get_episodes():
        episode_num_abs += 1
        try:
            episode_data = load_file(get_filepath(season, episode, 'processed'), True)
        except FileNotFoundError:
            if not silent_skip:
                logger.warning(f'Skipping Season {season}, Episode {episode}. No episode data file found.')
        else:
            for section_num_rel, section in enumerate(episode_data, start=1):
                section_num_abs += 1
                for quote_num_rel, quote in enumerate(section['quotes'], start=1):
                    quote_num_abs += 1

                    # Relative position
                    quote['quote_rel'] = quote_num_rel
                    quote['section_rel'] = section_num_rel
                    quote['episode_rel'] = episode

                    # Absolute position
                    quote['quote_abs'] = quote_num_abs
                    quote['section_abs'] = section_num_abs
                    quote['episode_abs'] = episode_num_abs

                    quote['season'] = season
                    quote['is_deleted'] = 'deleted' in section
                    quote['deleted_section'] = section.get('deleted')

                    data.append(quote)

    logger.info(f'Saving {len(data):,} quotes to algolia.json')
    save_file(os.path.join(DATA_DIR, 'algolia.json'), data, True)
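# Each record pushed to Algolia is a flat quote object; a sketch of one record,
# with illustrative values, based on the fields set above:
#
# {
#   "speaker": "Michael",
#   "text": "...",
#   "quote_rel": 3, "section_rel": 2, "episode_rel": 1,     # position within the season
#   "quote_abs": 412, "section_abs": 57, "episode_abs": 8,  # position across the whole show
#   "season": 1,
#   "is_deleted": false,
#   "deleted_section": null
# }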
def process(season: Optional[int], episode: Optional[int], all_: bool, report: bool):
    """
    Processes manually formatted raw quote data into JSON.
    """
    episodes: List[Tuple[int, int]]
    if all_:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Processing Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to process.')
        logger.info('Check --help for more information on this command.')
        return

    quote: Optional[Union[str, List[str]]]
    section_num: int

    for _season, _episode in episodes:
        sections = []
        quote = None  # Track the last quote seen so parse errors can be located.
        try:
            preprocessed_data = load_file(get_filepath(_season, _episode, 'raw'))

            # Sections are separated by lines starting with '-'.
            for section_num, raw_section in enumerate(re.split('^-', preprocessed_data, flags=re.MULTILINE), start=1):
                section = {
                    'quotes': []
                }
                section_data = list(raw_section.strip().split('\n'))

                # A leading '!N' line marks the section as deleted scene N.
                if section_data[0].startswith('!'):
                    section['deleted'] = int(re.search(r'!(\d+)', section_data.pop(0)).group(1))

                for quote in section_data:
                    quote = quote.split('|', 1)
                    section['quotes'].append(
                        {
                            'speaker': quote[0],
                            'text': quote[1]
                        }
                    )
                sections.append(section)
        except FileNotFoundError:
            logger.info(f'Skipped Season {_season}, Episode {_episode}, no file found.')
            continue
        except Exception:
            logger.exception(f'Skipped Season {_season}, Episode {_episode}: Malformed data.')
            if quote:
                logger.info(
                    f'Last quote seen "{quote if isinstance(quote, str) else "|".join(quote)}" in section {section_num}')
        else:
            # Save processed data
            save_file(get_filepath(_season, _episode, 'processed'), sections, True)

            if report:
                deleted_count = [0, set()]
                quote_count = 0
                speakers = set()

                for section in sections:
                    quote_count += len(section['quotes'])
                    if 'deleted' in section:
                        deleted_count[0] += 1
                        deleted_count[1].add(section['deleted'])
                    for quote in section['quotes']:
                        speakers.add(quote['speaker'])

                logger.debug(f'{quote_count} quotes.')
                logger.debug(f'{deleted_count[0]} deleted sections, {len(deleted_count[1])} unique.')
                logger.info(f'{len(speakers)} Speakers:')
                logger.info(', '.join(speakers))
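# The raw files parsed above follow a small line-oriented format; a sketch of
# one (speaker names and quote text illustrative):
#
#   Michael|Quote text here.
#   Jim|Another line of dialogue.
#   -
#   !3
#   Dwight|A quote from deleted scene 3.
#
# i.e. 'Speaker|Text' lines grouped into sections separated by '-' lines, with
# an optional leading '!N' marking a section as deleted scene N.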
def preprocess(season: int, episode: int, all: bool, overwrite: bool, silent_skip: bool,
               silent_skip_missing: bool, silent_skip_existing: bool):
    """
    Pre-processes raw HTML files into mangled custom quote data.

    Custom quote data requires manual inspection and formatting, making this a
    dangerous operation that may overwrite precious quote data.
    """
    episodes: List[Tuple[int, int]]
    if all:
        episodes = list(get_episodes())
    elif season:
        if episode:
            if verify_episode(season, episode):
                episodes = [(season, episode)]
            else:
                logger.error(f'Season {season}, Episode {episode} is not a valid combination.')
                return
        else:
            episodes = list(get_episodes(season=season))
            logger.info(f'Preprocessing Season {season}...')
    else:
        if episode:
            logger.info('You must specify more than just an episode.')
        else:
            logger.info('You must specify which episodes to pre-process.')
        logger.info('Check --help for more information on this command.')
        return

    for _season, _episode in episodes:
        # Overwrite protection
        save_path = get_filepath(_season, _episode, 'raw')
        if os.path.exists(save_path) and not overwrite:
            # Silent if either the general or the existing-file silent flag is set.
            if not (silent_skip or silent_skip_existing):
                logger.info(f'Skipping Season {_season}, Episode {_episode}: file already exists.')
            continue

        try:
            page_data = load_file(get_filepath(_season, _episode, 'html'), False)
        except FileNotFoundError:
            if not (silent_skip or silent_skip_missing):
                logger.warning(f'No data for Season {_season}, Episode {_episode} available. Skipping processing.')
        else:
            soup = BeautifulSoup(page_data, "html.parser")

            data = []
            sections = soup.find_all(attrs={"class": "quote"})
            for section in sections:
                # Turn <br> tags into explicit newlines before extracting text.
                for br in section.find_all('br'):
                    br.replace_with("\n" + br.text)
                for line in section.get_text().split('\n'):
                    data.append(line.strip())
                data.append('-')
            data.pop(-1)  # Drop the trailing section separator.

            data = '\n'.join(data)
            save_file(save_path, data, False)
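# For context, the officequotes.net markup this scrapes is roughly as follows
# (a hypothetical snippet, reconstructed from the selectors above rather than
# copied from the site):
#
#   <div class="quote">
#     <b>Michael:</b> Quote text.<br>
#     <b>Jim:</b> Reply text.<br>
#   </div>
#
# Each 'quote' div becomes one section; <br> tags become newlines, and the
# resulting lines are joined with '-' separators for manual cleanup into the
# 'Speaker|Text' format consumed by process().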