def __setHeader(self):
    """Print a coloured start-up banner describing this object.

    Builds one string from ``self.name``, ``self.author``, ``self.version``
    and ``self.info`` -- each preceded by a bold-yellow label and rendered
    in bold white -- and prints it. A leading newline separates the banner
    from whatever was printed before it.

    ``Terminal`` is not defined in this block; it has to provide callable
    ``bold_yellow`` and ``bold_white`` attributes that return strings.
    """
    t = Terminal()

    # The first three values carry their own trailing newline so that every
    # field lands on its own line; ``info`` has none because print() emits
    # the final newline itself.
    fields = (
        ("Initializing: ", self.name + "\n"),
        ("Author: ", self.author + "\n"),
        ("Version: ", self.version + "\n"),
        ("Info: ", self.info),
    )
    h = "\n" + "".join(
        t.bold_yellow(label) + t.bold_white(value) for label, value in fields
    )

    # Call form instead of the ``print h`` statement: the statement is a
    # SyntaxError on Python 3, while a single-argument print(...) produces
    # the same output on Python 2.
    print(h)
# NOTE(review): the line below is whitespace-collapsed and is cut off at the
# start of this view, so only what is visible is described here.
#
# 1. Tail of a function whose header is out of view: appends 'start_date' and
#    'end_date' to `headers`, removes every occurrence of ' – R', ' –R' and
#    ' -R' from each result's 'show' value, sets each result's 'start_date'
#    and 'end_date' from the variables of the same name, and returns
#    `(headers, results)`.
# 2. Script body: loads data/entries.json and, for each entry, prints its
#    title and link; skips the entry when the title does not contain
#    'Late-night ratings, ' or when the HTML in entry['content'][0]['value']
#    has no <table>; otherwise passes the title and the first table to
#    parse_table(). When parse_table() raises, the exception and a
#    "skipping" message are printed; whatever follows the except branch is
#    not visible here.
#
# NOTE(review): `assert len(entry['content']) == 1` validates input data, but
# assert statements are removed when Python runs with -O; consider an
# explicit check.
# NOTE(review): `except Exception` reports every parse_table() failure with
# the same message; confirm which errors are actually expected here.
headers.append('start_date') headers.append('end_date') for result in results: result['show'] = result['show'].replace(' – R', '').replace(' –R', '').replace(' -R', '') result['start_date'] = start_date result['end_date'] = end_date return headers, results with open('data/entries.json') as f: entries = json.load(f) all_results = [] for entry in entries: print(t.bold_white(entry['title'])) print(entry['link']) if 'Late-night ratings, ' not in entry['title']: print('Invalid title, skipping...') continue assert len(entry['content']) == 1 doc = lxml.html.fromstring(entry['content'][0]['value']) if not doc.cssselect('table'): print(t.yellow('No table found, skipping...')) continue table = doc.cssselect('table')[0] try: headers, results = parse_table(entry['title'], table) except Exception as e: print(e) print(t.yellow('Error parsing table, skipping...'))
"remaining_transcript": remaining_transcript } for filename in os.listdir("data/raw"): with open(f"data/transcripts/raw/{filename}") as f: item = json.load(f) date, title, url = item['date'], item['title'], item['url'] print(t.magenta(f"Parsing {date} {title}")) print(url) html = item.pop('html') parsed = parse_transcript(html) print(t.bold_white("Full Transcript:")) print(parsed['full_transcript']) print(t.bold_white("Video Clips:")) for i, video_clip in enumerate(parsed['video_clips']): print(f"Video Clip: {i+1}") print(video_clip) print(t.bold_white("Remaining Transcript:")) print(parsed['remaining_transcript']) print("----------------------------------------------------") output = {**item, **parsed} with open(f"data/transcripts/parsed/{filename}", "w") as f: