import re
from datetime import datetime, timezone
from pathlib import Path
from subprocess import Popen, PIPE

import feedparser
import requests
from bs4 import BeautifulSoup


def _parse_date_wsj(dateString):
    """Parse a WSJ-style date ("YYYY/MM/DD HH:MM:SS +HHMM") via feedparser."""
    wsj_date_format_re = re.compile(
        r'(\d{4})/(\d{1,2})/(\d{1,2})(?:\s{1,2}|T)'
        r'(\d{1,2}):(\d{1,2}):(\d{1,2})\s{0,2}([+-])(\d{1,2})(\d{2})')
    m = wsj_date_format_re.match(dateString)
    if not m:
        return None
    # Rebuild the date in W3DTF form and let feedparser do the actual parsing.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zone0)s%(zone1)s:%(zone2)s' % {
        'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
        'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),
        'zone0': m.group(7), 'zone1': m.group(8), 'zone2': m.group(9)}
    return feedparser._parse_date_w3dtf(w3dtfdate)
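
# Usage sketch (hypothetical input string; assumes the feed's dates really use the
# "YYYY/MM/DD HH:MM:SS +HHMM" layout matched by the regex above):
#
#     parsed = _parse_date_wsj('2013/02/01 14:30:00 +0100')
#     # feedparser._parse_date_w3dtf() returns a time.struct_time in UTC,
#     # so this would correspond to 2013-02-01 13:30:00 UTC.
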
def scrape_player_page(video):
    """Try to scrape the site for video metadata and download the stream."""
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text, 'html.parser')
    video_player = soup.body('a', {'data-json-href': True})[0]
    # The JSON metadata lives on different hosts depending on where the page came from.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get(
                "http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        print(Popen(["rtmpdump", "-o", str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in UTC
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
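
# Usage sketch (hypothetical relative URL; assumes download_from_playlist() is
# defined elsewhere in this script and that rtmpdump/mplayer are on PATH):
#
#     video = {'url': '/video/1234567/some-show/avsnitt-1'}
#     result = scrape_player_page(video)
#     if result:
#         print(result['title'], result.get('filename'))
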