# forked from Khan/youtube-export
# export.py — executable file, 130 lines (102 loc), 4.47 KB
#!/usr/bin/env python
# TODO(colin): fix these lint errors (http://pep8.readthedocs.io/en/release-1.7.x/intro.html#error-codes)
# pep8-disable:E128
import optparse
import sys
import s3
import zencode
import filelock
import util
logger = util.logger
class YouTubeExporter(object):
    """Convert our YouTube videos into downloadable formats.

    1) Take a YouTube URL and download the video to s3.
    2) Pass it through Zencoder to convert the video into various formats.
    3) Zencoder places the converted content in a different spot on s3.
    """

    @staticmethod
    def convert_missing_downloads(max_videos, dryrun=False):
        """Download from YouTube and use Zencoder to start converting any
        missing downloadable content into its appropriate downloadable format.

        Arguments:
            max_videos: stop after starting this many conversions.
            dryrun: if True, log what would happen but do not copy
                content on s3, download videos, or start zencoder jobs.

        Returns a tuple (videos_converted, error_ids), where error_ids is
        a list of youtube ids whose conversion could not be started.
        """
        videos_converted = 0
        error_ids = []

        # With this option, videos that are missing in the S3 converted
        # bucket are converted.  The API's download_urls is ignored.
        logger.info("Searching for videos that are missing from S3")
        formats_to_convert = s3.list_missing_converted_formats()
        legacy_mp4_videos = s3.list_legacy_mp4_videos()

        # .items() (not the Python-2-only .iteritems()) keeps this loop
        # working under both Python 2 and Python 3.
        for youtube_id, missing_formats in formats_to_convert.items():
            if videos_converted >= max_videos:
                logger.info("Stopping: max videos reached")
                break

            if "_DUP_" in youtube_id:
                logger.info(
                    ("Skipping video {0} as it has invalid DUP in youtube ID"
                     .format(youtube_id)))
                continue

            # We already know the formats are missing from S3.  Copy the
            # container so that removing "mp4" below doesn't mutate the
            # value stored in formats_to_convert.
            formats_to_create = list(missing_formats)

            if (youtube_id in legacy_mp4_videos and
                    "mp4" in formats_to_create):
                if dryrun:
                    logger.info(
                        "Skipping copy of legacy content due to dryrun")
                else:
                    s3.copy_legacy_content_to_new_location(youtube_id)
                # Even on a dryrun we drop "mp4" so we report the same
                # remaining formats a real run would convert.
                formats_to_create.remove("mp4")

            if not formats_to_create:
                continue

            logger.info("Starting conversion of %s into formats %s" %
                        (youtube_id, ",".join(formats_to_create)))

            if dryrun:
                logger.info(
                    "Skipping downloading and sending job to zencoder due to "
                    "dryrun")
                videos_converted += 1
            else:
                s3_source_url = s3.get_or_create_unconverted_source_url(
                    youtube_id)
                if not s3_source_url:
                    logger.warning("No S3 source URL created for %s; skipping"
                                   % youtube_id)
                    error_ids.append(youtube_id)
                    continue
                try:
                    zencode.start_converting(youtube_id, s3_source_url,
                                             formats_to_create)
                    videos_converted += 1
                # Broad catch is deliberate: one bad video must not abort
                # the whole batch.  ("except ... as ..." is valid on
                # Python 2.6+ and required on Python 3.)
                except Exception as why:
                    logger.error('Skipping youtube_id "%s": %s'
                                 % (youtube_id, why))
                    error_ids.append(youtube_id)

        return (videos_converted, error_ids)
def main():
parser = optparse.OptionParser()
parser.add_option("-n", "--no-log",
action="store_true", dest="nolog",
help="Log to stdout instead of to a log file", default=False)
parser.add_option("-m", "--max",
action="store", dest="max", type="int",
help="Maximum number of videos to process", default=1)
parser.add_option("-d", "--dryrun",
action="store_true", dest="dryrun",
help="Don't start new zencoder jobs or upload to s3",
default=False)
options, args = parser.parse_args()
util.setup_logging(options.nolog)
# Make sure only one youtube-export converter is running at a time.
with filelock.FileLock("export.lock", timeout=2):
(success, error_ids) = YouTubeExporter.convert_missing_downloads(
options.max, options.dryrun)
if error_ids:
msg = ('Skipped %d youtube-ids due to errors:\n%s\n'
% (len(error_ids), '\n'.join(sorted(error_ids))))
logger.warning(msg)
# Make this part of the stdout output as well, so it gets passed
# from cron to our email.
print msg
return (success, len(error_ids))
if __name__ == "__main__":
    # Exit with the number of videos that failed to convert, so cron /
    # wrapper scripts see a nonzero status on a partially-failed run
    # (0 means every attempted conversion was started successfully).
    (_, errors) = main()
    sys.exit(errors)