/
scrape.py
56 lines (49 loc) · 1.78 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from bs4 import BeautifulSoup as bs
from pydub import AudioSegment
from pydub.silence import split_on_silence
from datetime import datetime
from pprint import pprint
import sys, json, time
from cStringIO import StringIO
from pycaption import DFXPReader, SRTWriter, CaptionConverter
# Records built by toJson(), one dict per caption word, in processing order.
# The script entry point serialises this list as JSON.
doc = list()

# Value toJson() stores as the 'id' of the next record; it is incremented
# after every record so ids are unique across all processed episodes.
unique_id = 0
def getMsc(s):
    """Convert an ``HH:MM:SS[.fraction]`` timestamp to integer milliseconds.

    The digits after the decimal point are a fraction of a second, so they
    are scaled to exactly three digits before being added: ``.5`` means
    500 ms and ``.25`` means 250 ms, while digits beyond millisecond
    precision are truncated.  A timestamp without a fractional part is
    accepted and treated as ``.000``.

    Args:
        s: Timestamp string with (at least) three colon-separated fields.

    Returns:
        The offset in milliseconds as an ``int``.

    Raises:
        IndexError: If fewer than three colon-separated fields are present.
        ValueError: If a field is not a decimal integer.
    """
    fields = s.split(':')
    seconds_parts = fields[2].split('.')
    fraction = seconds_parts[1] if len(seconds_parts) > 1 else ''
    # Right-pad with zeros, then cut to three digits, so the fraction is
    # always interpreted as milliseconds regardless of how many digits the
    # caption file happened to write.
    millis = int((fraction + '000')[:3])
    return (int(fields[0]) * 3600000
            + int(fields[1]) * 60000
            + int(seconds_parts[0]) * 1000
            + millis)
def toJson(captions):
    """Append one record per caption word of an episode to the global ``doc``.

    Parses ``<captions>.xml`` and walks every ``<p>`` element.  The text of
    the element's first ``<span>`` (or of the ``<p>`` itself when it has no
    ``<span>``) has ``!``, ``,`` and ``.`` replaced by spaces and leading /
    trailing square brackets stripped, and is then split into words.  For
    each word a dict with these keys is appended to ``doc``:

    * ``id``            - running number taken from the global ``unique_id``
    * ``begin``/``end`` - the ``begin`` / ``end`` attributes of the ``<p>``,
      stored exactly as they appear in the caption file
    * ``caption_file``  - ``<captions>.srt``
    * ``video_file``    - ``<captions>.mp4``
    * ``word``          - the individual word
    * ``phrase``        - list of all words of the caption
    * ``phraseText``    - the cleaned caption text before splitting

    Args:
        captions: Path of the episode without extension; ``.xml`` and
            ``.mp4`` files with this prefix are read.

    Returns:
        None.  Results are delivered through the module-level ``doc`` list,
        and ``unique_id`` is advanced by one per appended record.
    """
    global unique_id

    # Parse inside a ``with`` block so the caption file is closed as soon
    # as it has been read instead of leaking the handle.
    with open(captions + '.xml') as caption_file:
        caption_data = bs(caption_file)

    # NOTE(review): the decoded audio is not used for any emitted field.
    # The load is kept only so that a missing or undecodable video still
    # aborts processing as before - consider removing it entirely.
    AudioSegment.from_file(captions + '.mp4')

    caption_name = captions + '.srt'
    video_name = captions + '.mp4'

    for segment in caption_data.find_all('p'):
        # Captions without a <span> wrapper carry their text directly in
        # the <p>; fall back to it rather than failing on a None lookup.
        span = segment.find('span')
        text_node = segment if span is None else span
        original = (text_node.get_text()
                    .replace('!', ' ')
                    .replace(',', ' ')
                    .strip('[')
                    .strip(']')
                    .replace('.', ' '))
        # split() without an argument collapses runs of whitespace, so the
        # spaces substituted for punctuation do not yield empty "words".
        text = original.split()
        begin = segment.get('begin')
        end = segment.get('end')
        for word in text:
            doc.append({
                'id': unique_id,
                'begin': begin,
                'end': end,
                'caption_file': caption_name,
                'video_file': video_name,
                'word': word,
                'phrase': text,
                'phraseText': original,
            })
            unique_id = unique_id + 1
def toSrt(file_name):
    """Convert ``<file_name>.xml`` (DFXP captions) into ``<file_name>.srt``.

    The transcript is read as text, every eighth-note character (U+266A)
    is removed, and the result is converted from DFXP to SRT with
    pycaption.  The ``.srt`` file is only opened once the conversion has
    succeeded, so a failing conversion does not leave a truncated file.

    Args:
        file_name: Path of the transcript without extension.

    Returns:
        None.  The converted captions are written to ``<file_name>.srt``.
    """
    # Local import keeps this function self-contained; io.open returns
    # text (not bytes) on both Python 2 and Python 3.
    import io

    converter = CaptionConverter()

    # NOTE(review): assumes the transcript is UTF-8 encoded - confirm
    # against the caption source.  Decoding here, instead of relying on the
    # implicit ASCII decode of unicode(), lets non-ASCII caption text
    # through.
    with io.open(file_name + '.xml', encoding='utf-8') as transcript:
        # The note is written as an escape so this source file stays pure
        # ASCII and needs no coding declaration.
        new_transcript = transcript.read().replace(u'\u266a', u'')

    converter.read(new_transcript, DFXPReader())
    srt_text = converter.write(SRTWriter())
    # Presumably the writer returns text; normalise in case a byte string
    # comes back so the text-mode write below always receives text.
    if isinstance(srt_text, bytes):
        srt_text = srt_text.decode('utf-8')

    with io.open(file_name + '.srt', 'w', encoding='utf-8') as srt_file:
        srt_file.write(srt_text)
if __name__ == "__main__":
    # Every non-empty line on stdin names one episode; its caption and
    # video files are looked up under the 'episodes/' directory.
    for line in sys.stdin:
        # Strip both LF and CRLF line endings so the name is a clean path.
        episode = line.rstrip('\r\n')
        if not episode:
            # A blank line would otherwise make toJson look for
            # 'episodes/.xml' and abort before any output is produced.
            continue
        toJson('episodes/' + episode)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(json.dumps(doc, indent=4, separators=(',', ': ')))