Пример #1
0
# Timestamp of the form "YYYY/MM/DD hh:mm:ss +hhmm". The date and time parts
# may be separated by a literal "T" or by up to two whitespace characters.
# (The previous pattern used the character class "[\s{0,2}|T]", which matches
# exactly one character out of whitespace, "{", "0", ",", "2", "}", "|", "T"
# rather than expressing that alternative.)
_WSJ_DATE_RE = re.compile(
    r'(\d{4})/(\d{,2})/(\d{,2})(?:T|\s{0,2})'
    r'(\d{,2}):(\d{,2}):(\d{,2})\s{,2}(\+|-)(\d{,2})(\d{2})'
)


def _parse_date_wsj(dateString):
    """Parse a "YYYY/MM/DD hh:mm:ss +hhmm" style date string.

    The string is matched from its start against ``_WSJ_DATE_RE`` and
    rewritten as ``YYYY-MM-DDThh:mm:ss+hh:mm``, which is then handed to
    ``feedparser._parse_date_w3dtf``.

    Args:
        dateString: the raw date text to parse.

    Returns:
        ``None`` when the string does not match the expected layout,
        otherwise whatever ``feedparser._parse_date_w3dtf`` returns for the
        rewritten string.
    """
    m = _WSJ_DATE_RE.match(dateString)
    if not m:
        return None
    # Groups, in order: year, month, day, hour, minute, second,
    # zone sign, zone hours, zone minutes.
    w3dtfdate = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(*m.groups())
    return feedparser._parse_date_w3dtf(w3dtfdate)
Пример #2
0
def scrape_player_page(video):
    """
    Try to scrape the player page for stream information and download the video.

    ``video`` is a dict that must contain ``'url'``; a URL that does not start
    with ``http`` is treated as relative to http://www.svtplay.se. The dict is
    updated in place: ``'url'`` and ``'duration'`` are always written,
    ``'title'``, ``'genre'`` and ``'timestamp'`` are filled in only when
    missing, and ``'filename'``/``'category'`` are set when an m3u8 stream
    reference is found.

    Depending on which keys the player JSON contains, the stream is fetched
    with ``rtmpdump`` (``dynamicStreams``), ``mplayer`` (``pathflv``) or
    ``download_from_playlist`` (``video``).

    Returns the updated ``video`` dict, or ``False`` when
    ``download_from_playlist`` reports failure.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    # The first <a> carrying a data-json-href attribute points at the player JSON.
    video_player = soup.body('a', {'data-json-href': True})[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
                "http://www.oppetarkiv.se/%s" % json_href + "?output=json").json()
    elif json_href.startswith("/wd"):
        flashvars = requests.get("http://www.svt.se/%s" % json_href).json()
    else:
        flashvars = requests.get(
                "http://www.svtplay.se/%s" % json_href + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        # '|' and '/' are replaced because the title is used to build file names below.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        genre_label = soup.find(text='Kategori:')
        if genre_label:
            video['genre'] = genre_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    # NOTE(review): Path.with_suffix() replaces everything after the last dot
    # of the title, so a title containing a dot is truncated in the file names
    # built below -- confirm whether that is intended before changing it.
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # str() is required: concatenating a str with a Path raises TypeError.
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        # Pass the file name as a plain string so the argv list holds only str.
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        dated_tags = soup.find_all(datetime=True)
        if dated_tags:
            xmldate_str = dated_tags[0].attrs['datetime']
            # Guard against an empty attribute and against a false parse result,
            # which would otherwise fail when sliced below.
            parsed = feedparser._parse_date_w3dtf(xmldate_str) if xmldate_str else None
            if parsed:
                video['timestamp'] = datetime(*parsed[:6])  # naive in utc
                video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    # NOTE(review): 'url' is always present at this point (it is read on the
    # first line of the function), so this check never fires as written.
    # Presumably it was meant to detect "no stream key found in flashvars";
    # verify against callers before changing the return value.
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video