def process_episode(data):
  e = Episode()
  e.uri = data['uri']
  if 'title' in data:
    e.title = data['title']
  if 'description' in data:
    e.summary = data['description']
  if 'episode_number' in data:
    e.number = data['episode_number']
  if 'genres' in data:
    e.genres = get_genres(data['genres'])

  # Media type (ignore virtual entries)
  if 'schedule_only' not in data or not data['schedule_only']:
    if 'media_type' in data:
      e.media = data['media_type']

  # Brand/Series
  c_uri = None
  s_uri = None
  if 'container' in data and 'uri' in data['container']:
    c_uri = data['container']['uri']
  if 'series_summary' in data and 'uri' in data['series_summary']:
    s_uri = data['series_summary']['uri']
  if c_uri and c_uri != s_uri:
    e.brand = get_brand(c_uri, data['container'])
  if s_uri:
    e.series = get_series(s_uri, data['series_summary'])

  # Complete the link
  if e.series and e.brand:
    e.series.brand = e.brand

  # Film?
  if 'specialization' in data:
    e.film = data['specialization'] == 'film'
    if 'year' in data:
      e.year = int(data['year'])

  # Black and White?
  if 'black_and_white' in data:
    e.baw = data['black_and_white']

  # People
  if 'people' in data:
    e.credits = process_people(data['people'])

  # Titles of the form "Episode N" or a bare date are not real titles
  if e.title:
    r = re.search(r'^Episode (\d+)$', e.title)
    if r:
      e.title = None
      if e.number is None:
        e.number = util.str2num(r.group(1))
    elif re.search(r'^\d+/\d+/\d+$', e.title):
      e.title = None

  # OK
  log.debug('episode = %s' % e, 5)
  return e
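# A minimal sketch of the Atlas item shape that process_episode() reads,
# based only on the keys referenced above. All values are invented
# placeholders, not real Atlas data; the trailing comments show the Episode
# field each key would populate.
_EXAMPLE_ITEM = {
  'uri':            'http://example.org/content/item1',  # -> e.uri
  'title':          'Episode 3',                         # -> e.title (cleared by the regex above)
  'description':    'Example synopsis.',                 # -> e.summary
  'episode_number': 3,                                   # -> e.number
  'media_type':     'video',                             # -> e.media
  'specialization': 'film',                              # -> e.film = True
  'year':           '1999',                              # -> e.year = 1999
}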
def get_channel(uri, data):
  log.debug('get_channel(%s)' % uri, 4)

  # Check cache
  ret = cache.get_channel(uri)

  # Process
  if ret is None:
    ret = process_channel(data)

  # Cache
  if ret:
    cache.put_channel(uri, ret)

  return ret
def finish(self):

  # Sort by start time, then stop time
  def tsort(x, y):
    r = cmp(x.start, y.start)
    if not r:
      r = cmp(x.stop, y.stop)
    return r

  for c in self.schedule:
    self.schedule[c].sort(cmp=tsort)

    # Truncate any entry that overlaps the one that follows it
    p = None
    for i in self.schedule[c]:
      if p and p.stop > i.start:
        log.debug('epg - schedule overlap detected')
        log.debug('epg - assume multi-provider discrepancy, will correct')
        p.stop = i.start
      p = i
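# Hedged illustration of the correction applied in finish() above, using
# invented times: after sorting by (start, stop), an entry that ends after
# the next one begins is truncated to that start time.
#
#   before: 20:00-21:05, 21:00-22:00   (5 minute overlap)
#   after:  20:00-21:00, 21:00-22:00   (previous stop clamped to next start)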
def publisher_overlay(a, b, pubs):
  ignore_keys = conf.get('atlas_overlay_ignore', ['uri'])  # , 'transmission_end_time'
  pa = a['publisher']['key']
  pb = b['publisher']['key']

  # Rank the two items by their publisher's position in pubs
  ia = -1
  ib = -1
  try:
    ia = pubs.index(pa)
  except:
    pass
  try:
    ib = pubs.index(pb)
  except:
    pass

  # Recursively overlay b onto a
  def _overlay(a, b):
    if type(b) == dict:
      for k in b:
        if k not in a:
          a[k] = b[k]
        elif k not in ignore_keys:
          a[k] = _overlay(a[k], b[k])
      return a
    elif type(b) == list:
      for i in range(len(b)):
        if i < len(a):
          a[i] = _overlay(a[i], b[i])
        else:
          a.append(b[i])
      return a
    else:
      return b

  # Ensure b (overlaid on top) is the item whose publisher appears later in pubs
  if ib < ia:
    a, b = b, a

  args = (a['uri'],
          a['broadcasts'][0]['transmission_time'].strftime('%H:%M'),
          a['broadcasts'][0]['transmission_end_time'].strftime('%H:%M'),
          b['uri'],
          b['broadcasts'][0]['transmission_time'].strftime('%H:%M'),
          b['broadcasts'][0]['transmission_end_time'].strftime('%H:%M'))
  log.debug('overlay %s @ %s-%s with %s @ %s-%s' % args, 6)
  return _overlay(a, b)
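# Hedged illustration of the merge above, using invented plain dicts rather
# than real Atlas items. The item whose publisher appears later in pubs is
# overlaid on top of the other: its values win for shared keys, keys it adds
# are filled in, and keys in ignore_keys ('uri' by default) keep the base
# item's value.
#
#   base    = {'uri': 'a', 'title': 'Short title'}
#   overlay = {'uri': 'b', 'title': 'Long title', 'description': 'Synopsis'}
#   result  = {'uri': 'a', 'title': 'Long title', 'description': 'Synopsis'}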
def get_brand(uri, data=None):
  log.debug('get_brand(%s)' % uri, 4)

  # Check the cache
  ret = cache.get_brand(uri)

  # Get remote
  if ret is None:
    try:
      if not data or data.keys() == ['uri']:
        data = get_content(uri, 'brand')
      if data:
        ret = process_brand(data)
    except:
      pass

  # Put in cache
  if ret:
    cache.put_brand(uri, ret)

  return ret
def get_series(uri, data=None):
  log.debug('get_series(%s)' % uri, 4)

  # Check cache
  ret = cache.get_series(uri)

  # Get remote
  if ret is None:
    try:
      if not data or data.keys() == ['uri']:
        data = get_content(uri, 'series')
      if data:
        ret = process_series(data)
    except:
      pass

  # Cache
  if ret:
    cache.put_series(uri, ret)

  return ret
def put_file(name, data, imeta={}):
  log.debug('cache: put file %s' % name, 3)

  # Fix meta (use lower case keys)
  meta = {}
  for k in imeta:
    meta[k.lower()] = imeta[k]

  # Add MD5
  if 'md5' not in meta:
    meta['md5'] = md5(data)

  # Store
  path = CACHE_PATH + os.path.sep + name
  if not os.path.exists(os.path.dirname(path)):
    os.makedirs(os.path.dirname(path))
  open(path, 'w').write(data)
  open(path + '.meta', 'w').write(repr(meta))
  log.debug('cache: file %s stored' % name)
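# Sketch of the on-disk layout written by put_file() and read back by
# _get_file() below; the path is illustrative only. The payload is stored
# verbatim, and a companion '.meta' file holds the lower-cased header dict
# as a Python literal (repr on write, eval on read).
#
#   <CACHE_PATH>/example.org/logo.png        <- raw body
#   <CACHE_PATH>/example.org/logo.png.meta   <- e.g. {'last-modified': '...', 'md5': '...'}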
def atlas_fetch(url, conn):
  jdata = None
  url = ('http://%s/3.0/' % ATLAS_API_HOST) + url
  log.debug('fetch %s' % url, 2)

  # Can fail occasionally - give more than 1 attempt, with backoff
  t = 2.0
  for i in range(5):
    try:
      data = cache.get_url(url, cache=False, conn=conn)
      if data:
        log.debug('decode json', 3)
        jdata = json.loads(data)
        log.debug(jdata, 5, pprint=True)
      break
    except Exception, e:
      import traceback
      traceback.print_exc()
      log.warn('failed to fetch %s [e=%s]' % (url, e))
    time.sleep(t)
    t *= 2

  return jdata
def touch_file(name):
  log.debug('cache: touch %s' % name)
  path = CACHE_PATH + os.path.sep + name
  if os.path.exists(path):
    os.utime(path, None)
def get_url(url, cache=True, ttl=0, conn=None):
  import urllib2, urlparse
  log.debug('cache: get url %s' % url, 3)
  ret = None

  # Map the URL onto a cache path
  urlp = urlparse.urlparse(url)
  path = urlp.netloc + os.path.sep + urlp.path[1:]
  http = urlp.scheme in ['http', 'https']

  # Don't cache dynamic requests
  if urlp.params or urlp.query:
    cache = False

  # Create request
  req = urllib2.Request(url)
  req.add_header('User-Agent', PYEPG_USER_AGENT)

  # Check cache
  if cache:
    (data, meta, fresh, valid) = _get_file(path, ttl=ttl)

    # OK
    if data and meta and valid:

      # TTL expired - check remote headers
      if not fresh:
        head = {}

        # Fetch remote headers
        if http and conn:
          log.debug('cache: use persistent connection', 5)
          conn.request('GET', url, None, {'User-Agent': PYEPG_USER_AGENT})
          h = conn.getresponse().getheaders()
          for (k, v) in h:
            head[k.lower()] = v
        else:
          req.get_method = lambda: 'HEAD'
          up = urllib2.urlopen(req, timeout=60.0)
          head = up.headers

        # Static page unmodified - reuse cached copy
        if 'last-modified' in head and 'last-modified' in meta and \
           head['last-modified'] == meta['last-modified']:
          log.debug('cache: last-modified matches', 4)
          ret = data

          # Update timestamp
          touch_file(path)

      # Still within TTL
      else:
        ret = data

  # Fetch
  if not ret:
    log.debug('cache: fetch remote', 1)
    head = {}
    if http and conn:
      log.debug('cache: use persistent connection', 5)
      conn.request('GET', url, None, {'User-Agent': PYEPG_USER_AGENT})
      r = conn.getresponse()
      for (k, v) in r.getheaders():
        head[k.lower()] = v
      ret = r.read()
    else:
      req.get_method = lambda: 'GET'
      up = urllib2.urlopen(req, timeout=60.0)
      ret = up.read()
      head = up.headers

    # Store
    if cache:
      put_file(path, ret, head)

  return ret
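# Minimal usage sketch for get_url() above; the host and paths are
# hypothetical. Static resources are cached on disk for the given TTL and
# revalidated against the Last-Modified header, URLs carrying a query string
# bypass the cache, and an httplib connection can be passed in to reuse a
# persistent socket.
#
#   body = get_url('http://example.org/channels.xml', ttl=86400)
#   conn = httplib.HTTPConnection('example.org')
#   body = get_url('http://example.org/schedule?day=1', conn=conn)  # never cached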
def run(self):
  conn = None
  log.debug('atlas - grab thread %3d started' % self._idx, 0)

  # Create connection
  import httplib
  retry = conf.get('atlas_conn_retry_limit', 5)
  while not conn and retry:
    try:
      conn = httplib.HTTPConnection(ATLAS_API_HOST)
      log.debug('atlas - grab thread %3d conn created' % self._idx, 1)
    except:
      retry = retry - 1
      time.sleep(conf.get('atlas_conn_retry_period', 2.0))
  if not conn:
    log.error('atlas - grab thread %3d failed to connect' % self._idx)
    return

  # Config
  key = conf.get('atlas_apikey', None)
  p_pubs = conf.get('atlas_primary_publishers',
                    ['bbc.co.uk', 'itv.com', 'tvblob.com', 'channel4.com'])
  s_pubs = conf.get('atlas_secondary_publishers',
                    ['pressassociation.com'])
  anno = ['broadcasts', 'extended_description', 'series_summary',
          'brand_summary', 'people']
  tsize = conf.get('atlas_time_chunk', self._stop - self._start)

  # Time
  tm_from = time.mktime(self._start.timetuple())
  tm_to = time.mktime(self._stop.timetuple())

  # URL base
  url = 'schedule.json?'
  url = url + 'annotations=' + ','.join(anno)
  if key:
    url = url + '&apiKey=' + key

  # Until queue exhausted
  while True:

    # Get next entry
    c = None
    try:
      c = self._inq.get_nowait()
    except Empty:
      break
    log.debug('atlas - grab thread %3d fetch %s' % (self._idx, c.title), 0)
    sched = []

    # By time
    tf = tm_from
    while tf < tm_to:
      tt = min(tf + tsize, tm_to)
      a = (time.strftime('%Y-%m-%d %H:%M', time.localtime(tf)),
           time.strftime('%Y-%m-%d %H:%M', time.localtime(tt)))
      #log.info('atlas - period %s to %s' % a)

      # Process each publisher (secondary first, primary last)
      pubs = []
      for p in s_pubs:
        pubs.append(p)
      for p in p_pubs:
        if p in c.publisher:
          pubs.append(p)
      log.debug('PUBS: %s' % pubs, 0)
      for p in pubs:
        #log.info('atlas - publisher %s' % p)
        u = url + '&from=%d&to=%d' % (tf, tt)
        u = u + '&publisher=' + p
        u = u + '&channel_id=' + c.shortid

        # Fetch data
        data = atlas_fetch(u, conn)

        # Process
        if data and 'schedule' in data:
          for s in data['schedule']:
            if 'items' in s:
              sched.extend(s['items'])

      # Update
      tf = tf + tsize

    # Put into the output queue
    log.debug('atlas - grab thread %3d fetched %s' % (self._idx, c.title), 1)
    self._outq.put((c, pubs, sched))
    self._inq.task_done()

  # Done
  if conn:
    conn.close()
  log.debug('atlas - grab thread %3d complete' % self._idx, 0)
def _get_file(name, ttl=None):
  import time
  log.debug('cache: get file %s' % name, 3)
  ok = False
  data = None
  meta = None
  valid = False
  path = CACHE_PATH + os.path.sep + name

  # Default TTL
  if ttl is None:
    ttl = conf.get('default_cache_ttl', 7 * 86400)

  # Check age
  if os.path.exists(path) and os.path.exists(path + '.meta'):
    log.debug('cache: %s in cache' % name, 4)
    st = os.stat(path)
    meta = eval(open(path + '.meta').read())
    data = open(path).read()

    # Still fresh
    if (st.st_mtime + ttl) > time.time():
      log.debug('cache: %s ttl ok' % name, 4)
      ok = True

    # TTL passed
    else:
      log.debug('cache: %s ttl expired' % name, 4)

    # Validate
    if 'md5' in meta and meta['md5'] == md5(data):
      log.debug('cache: %s md5 ok' % name, 4)
      valid = True
    else:
      log.debug('cache: %s md5 mismatch' % name)

  # Return data
  return (data, meta, ok, valid)
def run(self):
  log.debug('atlas - data thread %3d started' % self._idx, 0)
  while True:
    c = sched = None
    try:
      (c, pubs, sched) = self._inq.get()
    except Empty:
      break
    log.debug('atlas - data thread %3d process %s' % (self._idx, c.title), 0)

    # Process times
    for s in sched:
      for i in range(len(s['broadcasts'])):
        for k in s['broadcasts'][i]:
          if 'time' in k:
            try:
              s['broadcasts'][i][k] = atlas_p_time(s['broadcasts'][i][k])
            except:
              pass

    # Process overlays
    log.debug('atlas - data thread %3d overlay %s' % (self._idx, c.title), 1)
    log.debug('atlas - publishers %s' % pubs, 2)
    sched = process_publisher_overlay(sched, pubs)

    # Process into EPG
    log.debug('atlas - data thread %3d store %s' % (self._idx, c.title), 1)
    process_schedule(self._epg, c, sched)

    # Done
    self._inq.task_done()
  log.debug('atlas - data thread %3d complete' % self._idx, 0)