Exemplo n.º 1
0
 def __setHeader(self):
     """Print a colored banner built from this object's descriptive fields.

     Reads ``self.name``, ``self.author``, ``self.version`` and ``self.info``.
     The first three have a newline appended by string concatenation, so they
     must be strings. Each label is styled with ``bold_yellow`` and each value
     with ``bold_white`` on a fresh ``Terminal`` instance.

     Returns:
         None. The banner is written to standard output.
     """
     # NOTE(review): Terminal is presumably the blessings/blessed terminal
     # wrapper -- confirm against the file's imports.
     t = Terminal()
     # (label, value) pairs in display order. The trailing newline is part of
     # the styled value, and the last field deliberately has none.
     fields = (
         ("Initializing: ", self.name + "\n"),
         ("Author: ", self.author + "\n"),
         ("Version: ", self.version + "\n"),
         ("Info: ", self.info),
     )
     h = "\n"  # leading blank line separates the banner from earlier output
     for label, value in fields:
         h += t.bold_yellow(label) + t.bold_white(value)
     # Call form instead of the Python 2 print statement: a single
     # parenthesized argument prints identically on Python 2 and Python 3.
     print(h)
    headers.append('start_date')
    headers.append('end_date')
    for result in results:
        result['show'] = result['show'].replace(' – R', '').replace(' –R', '').replace(' -R', '')
        result['start_date'] = start_date
        result['end_date'] = end_date

    return headers, results

# Load the entries to process. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's locale encoding.
with open('data/entries.json', encoding='utf-8') as f:
    entries = json.load(f)

all_results = []

for entry in entries:
    print(t.bold_white(entry['title']))
    print(entry['link'])
    # Only entries whose title contains the ratings marker are processed.
    if 'Late-night ratings, ' not in entry['title']:
        print('Invalid title, skipping...')
        continue
    # Sanity check: exactly one content item is expected per entry.
    assert len(entry['content']) == 1
    doc = lxml.html.fromstring(entry['content'][0]['value'])
    # Query the document for tables once and reuse the result for both the
    # emptiness check and the selection of the first table.
    tables = doc.cssselect('table')
    if not tables:
        print(t.yellow('No table found, skipping...'))
        continue
    table = tables[0]
    try:
        headers, results = parse_table(entry['title'], table)
    except Exception as e:
        # Best effort: report the parsing failure for this entry instead of
        # aborting the whole run.
        print(e)
        print(t.yellow('Error parsing table, skipping...'))
        "remaining_transcript": remaining_transcript
    }

# List the same directory the files are opened from, so that every listed
# name resolves to an existing path.
raw_dir = "data/transcripts/raw"

for filename in os.listdir(raw_dir):

    # Decode as UTF-8 explicitly rather than relying on the locale default.
    with open(f"{raw_dir}/{filename}", encoding="utf-8") as f:
        item = json.load(f)

    date, title, url = item['date'], item['title'], item['url']
    print(t.magenta(f"Parsing {date} {title}"))
    print(url)

    # Remove the raw HTML from the item so it is not carried into `output`.
    html = item.pop('html')
    parsed = parse_transcript(html)

    print(t.bold_white("Full Transcript:"))
    print(parsed['full_transcript'])

    print(t.bold_white("Video Clips:"))
    for i, video_clip in enumerate(parsed['video_clips']):
        print(f"Video Clip: {i+1}")
        print(video_clip)

    print(t.bold_white("Remaining Transcript:"))
    print(parsed['remaining_transcript'])

    print("----------------------------------------------------")

    # Merge the remaining item fields with the parsed fields; on a key
    # collision the parsed value wins.
    output = {**item, **parsed}

    with open(f"data/transcripts/parsed/{filename}", "w") as f: