def run(db, es, options=None):
    """Scrape the Senate committee hearings XML feed and upsert hearing
    records into the 'hearings' collection.

    db      -- database wrapper (collection access plus note/warning/success logging)
    es      -- Elasticsearch client (unused here; kept for the task interface)
    options -- optional task options dict (currently unused)
    """
    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Best-effort fetch: record the outage instead of crashing the task.
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0
        for meeting in meetings:
            # Placeholder rows ("No meetings scheduled.") carry no data.
            if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            # Codes look like "SSFR09": letters = committee id, trailing
            # digits = subcommittee id; "00" means the full committee.
            full_id = meeting.cmte_code.contents[0].strip()
            match = re.search(r"^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                # Subcommittees are stored under the full code (e.g. "SSFR09").
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            elif committee_id == "JCC":
                chamber = "joint"
            else:
                db.warning(
                    "Couldn't locate committee by committee_id %s" % committee_id,
                    {'committee_id': committee_id})
                continue

            # Feed times are Eastern; build an EST-aware datetime.
            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(
                date_string, "%d-%b-%Y %I:%M %p")[0:6],
                tzinfo=python_utils.EST())
            congress = python_utils.current_congress(occurs_at.year)

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. &amp;quot; -- unescape twice
            description = parser.unescape(parser.unescape(description))

            bill_ids = python_utils.extract_bills(description, congress)

            if subcommittee_id is None:
                sub = '00'
            else:
                sub = str(subcommittee_id)

            # making sure the ids are more reproducible: hash a normalized
            # date + committee + subcommittee string
            date_string = occurs_at.strftime("%d-%b-%Y %I:%M %p")
            id_string = (date_string + str(committee_id) + sub).encode("utf-8")
            hearing_id = hashlib.md5(id_string).hexdigest()

            # Match an existing hearing by time OR description so reschedules
            # and reworded notices update rather than duplicate.
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [{'occurs_at': occurs_at}, {'description': description}]
            })

            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {
                    'chamber': chamber,
                    'committee_id': committee_id,
                    'hearing_id': hearing_id
                }
                # Only stamp created_at on first insert.
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id

            hearing['updated_at'] = datetime.datetime.now()
            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
def get_videos(db, es, client_name, chamber, archive=False, captions=False):
    """Fetch videos for a Granicus client and upsert them into the 'videos'
    collection, optionally indexing per-clip documents into Elasticsearch.

    db          -- database wrapper
    es          -- Elasticsearch client
    client_name -- Granicus client id used to build the API URL
    chamber     -- 'house' or 'senate'; selects how clip markers are fetched
    archive     -- fetch the full archive instead of only the 2 newest videos
    captions    -- also fetch captions and index clip documents into ES
    """
    api_url = API_PREFIX + client_name + '?type=video'
    data = '{ "sort": [ {"datetime": {"order": "desc" }} ] }'

    if archive:
        api_url += '&size=100000'
    else:
        api_url += '&size=2'

    videos = query_api(db, api_url, data)
    if not videos:
        db.warning("Granicus API appears to be down",
                   {'errors': PARSING_ERRORS})
        sys.exit()

    vcount = 0
    for vid in videos:
        v = vid['_source']
        legislative_day = dateparse(v['datetime'])
        # Video id is the chamber plus the day's unix timestamp.
        video_id = chamber + '-' + str(
            int(timey.mktime(legislative_day.timetuple())))
        new_vid = db.get_or_initialize('videos', {'video_id': video_id})

        # initialize arrays and dicts so we don't have to worry about it later
        if 'clip_urls' not in new_vid:
            new_vid['clip_urls'] = {}
        if 'bill_ids' not in new_vid:
            new_vid['bill_ids'] = []
        if 'legislator_ids' not in new_vid:
            new_vid['legislator_ids'] = []
        if 'legislator_names' not in new_vid:
            new_vid['legislator_names'] = []
        if 'created_at' not in new_vid:
            new_vid['created_at'] = datetime.now()
        new_vid['updated_at'] = datetime.now()

        # video id, clips array, legislators array, bills array
        new_vid = try_key(v, 'id', 'clip_id', new_vid)
        new_vid = try_key(v, 'duration', 'duration', new_vid)
        new_vid = try_key(v, 'datetime', 'published_at', new_vid)

        # normalize timestamp format to RFC3339 in UTC
        new_vid['published_at'] = rfc3339(dateparse(new_vid['published_at']))

        new_vid['clip_urls'] = try_key(v, 'http', 'mp4', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'hls', 'hls', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'rtmp', 'rtmp', new_vid['clip_urls'])

        new_vid['legislative_day'] = legislative_day.strftime('%Y-%m-%d')
        new_vid['chamber'] = chamber
        new_vid['congress'] = python_utils.current_congress(
            legislative_day.year)

        if chamber == 'house':
            (new_vid['clips'], new_vid['bill_ids'],
             new_vid['legislator_names'], new_vid['legislator_ids'],
             new_vid['roll_ids']) = get_markers(
                db, client_name, new_vid['clip_id'],
                new_vid['congress'], chamber)
        elif chamber == 'senate':
            (new_vid['clips'], new_vid['bill_ids'],
             new_vid['legislator_names'], new_vid['legislator_ids'],
             new_vid['roll_ids']) = get_clips_for_senate(
                db, new_vid['clip_id'], new_vid['congress'],
                new_vid['duration'], dateparse(new_vid['published_at']).year)

        if new_vid['clips'] is None:
            print("Couldn't fetch information for video, skipping.")
            continue

        # make sure the last clip has a duration
        if new_vid['clips'] and len(new_vid['clips']) > 0:
            new_vid['clips'][-1]['duration'] = (
                new_vid['duration'] - new_vid['clips'][-1]['offset'])

        if captions:
            new_vid['captions'], new_vid['caption_srt_file'] = get_captions(
                client_name, new_vid['clip_id'])

        db['videos'].save(new_vid)
        vcount += 1

        # index clip objects in elastic search
        if captions and new_vid.get('clips'):
            # enumerate instead of list.index(): O(n) total and correct
            # even when two clip dicts compare equal
            for i, c in enumerate(new_vid['clips']):
                clip = {
                    'id': "%s-%s" % (new_vid['video_id'], i),
                    'video_id': new_vid['video_id'],
                    'video_clip_id': new_vid['clip_id'],
                    'offset': c['offset'],
                    'duration': c['duration'],
                    'legislative_day': new_vid['legislative_day'],
                    'published_at': new_vid['published_at'],
                    'clip_urls': new_vid['clip_urls']
                }
                clip = try_key(c, 'legislator_names', 'legislator_names', clip)
                clip = try_key(c, 'roll_ids', 'roll_ids', clip)
                clip = try_key(c, 'events', 'events', clip)
                clip = try_key(c, 'bill_ids', 'bill_ids', clip)
                clip = try_key(c, 'legislator_ids', 'legislator_ids', clip)

                if 'caption_srt_file' in new_vid:
                    # BUGFIX: a stray trailing comma previously stored a
                    # 1-tuple here instead of the URL string
                    clip['srt_link'] = new_vid['caption_srt_file']
                if 'captions' in new_vid:
                    # pass a boolean if this is the first clip
                    clip['captions'] = get_clip_captions(new_vid, c, i == 0)

                es.save(clip, 'clips', clip['id'])

        print("Successfully processed %s" % new_vid['clip_id'])

    es.connection.indices.refresh()
    # BUGFIX: format args were swapped (client name filled the count slot)
    db.success("Updated or created %s legislative days for %s video" %
               (vcount, client_name))
def get_videos(db, es, client_name, chamber, archive=False, captions=False):
    """Fetch videos for a Granicus client and upsert them into the 'videos'
    collection, optionally indexing per-clip documents into Elasticsearch.

    db          -- database wrapper
    es          -- Elasticsearch client
    client_name -- Granicus client id used to build the API URL
    chamber     -- 'house' or 'senate'; selects how clip markers are fetched
    archive     -- fetch the full archive instead of only the 2 newest videos
    captions    -- also fetch captions and index clip documents into ES
    """
    api_url = API_PREFIX + client_name + '?type=video'
    data = '{ "sort": [ {"datetime": {"order": "desc" }} ] }'

    if archive:
        api_url += '&size=100000'
    else:
        api_url += '&size=2'

    videos = query_api(db, api_url, data)
    if not videos:
        db.warning("Granicus API appears to be down",
                   {'errors': PARSING_ERRORS})
        sys.exit()

    vcount = 0
    for vid in videos:
        v = vid['_source']
        legislative_day = dateparse(v['datetime'])
        # Video id is the chamber plus the day's unix timestamp.
        video_id = chamber + '-' + str(
            int(timey.mktime(legislative_day.timetuple())))
        new_vid = db.get_or_initialize('videos', {'video_id': video_id})

        # initialize arrays and dicts so we don't have to worry about it later
        if 'clip_urls' not in new_vid:
            new_vid['clip_urls'] = {}
        if 'bill_ids' not in new_vid:
            new_vid['bill_ids'] = []
        if 'legislator_ids' not in new_vid:
            new_vid['legislator_ids'] = []
        if 'legislator_names' not in new_vid:
            new_vid['legislator_names'] = []
        if 'created_at' not in new_vid:
            new_vid['created_at'] = datetime.now()
        new_vid['updated_at'] = datetime.now()

        # video id, clips array, legislators array, bills array
        new_vid = try_key(v, 'id', 'clip_id', new_vid)
        new_vid = try_key(v, 'duration', 'duration', new_vid)
        new_vid = try_key(v, 'datetime', 'published_at', new_vid)

        # normalize timestamp format to RFC3339 in UTC
        new_vid['published_at'] = rfc3339(dateparse(new_vid['published_at']))

        new_vid['clip_urls'] = try_key(v, 'http', 'mp4', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'hls', 'hls', new_vid['clip_urls'])
        new_vid['clip_urls'] = try_key(v, 'rtmp', 'rtmp', new_vid['clip_urls'])

        new_vid['legislative_day'] = legislative_day.strftime('%Y-%m-%d')
        new_vid['chamber'] = chamber
        new_vid['congress'] = python_utils.current_congress(
            legislative_day.year)

        if chamber == 'house':
            (new_vid['clips'], new_vid['bill_ids'],
             new_vid['legislator_names'], new_vid['legislator_ids'],
             new_vid['roll_ids']) = get_markers(
                db, client_name, new_vid['clip_id'],
                new_vid['congress'], chamber)
        elif chamber == 'senate':
            (new_vid['clips'], new_vid['bill_ids'],
             new_vid['legislator_names'], new_vid['legislator_ids'],
             new_vid['roll_ids']) = get_clips_for_senate(
                db, new_vid['clip_id'], new_vid['congress'],
                new_vid['duration'], dateparse(new_vid['published_at']).year)

        if new_vid['clips'] is None:
            print("Couldn't fetch information for video, skipping.")
            continue

        # make sure the last clip has a duration
        if new_vid['clips'] and len(new_vid['clips']) > 0:
            new_vid['clips'][-1]['duration'] = (
                new_vid['duration'] - new_vid['clips'][-1]['offset'])

        if captions:
            new_vid['captions'], new_vid['caption_srt_file'] = get_captions(
                client_name, new_vid['clip_id'])

        db['videos'].save(new_vid)
        vcount += 1

        # index clip objects in elastic search
        if captions and new_vid.get('clips'):
            # enumerate instead of list.index(): O(n) total and correct
            # even when two clip dicts compare equal
            for i, c in enumerate(new_vid['clips']):
                clip = {
                    'id': "%s-%s" % (new_vid['video_id'], i),
                    'video_id': new_vid['video_id'],
                    'video_clip_id': new_vid['clip_id'],
                    'offset': c['offset'],
                    'duration': c['duration'],
                    'legislative_day': new_vid['legislative_day'],
                    'published_at': new_vid['published_at'],
                    'clip_urls': new_vid['clip_urls']
                }
                clip = try_key(c, 'legislator_names', 'legislator_names', clip)
                clip = try_key(c, 'roll_ids', 'roll_ids', clip)
                clip = try_key(c, 'events', 'events', clip)
                clip = try_key(c, 'bill_ids', 'bill_ids', clip)
                clip = try_key(c, 'legislator_ids', 'legislator_ids', clip)

                if 'caption_srt_file' in new_vid:
                    # BUGFIX: a stray trailing comma previously stored a
                    # 1-tuple here instead of the URL string
                    clip['srt_link'] = new_vid['caption_srt_file']
                if 'captions' in new_vid:
                    # pass a boolean if this is the first clip
                    clip['captions'] = get_clip_captions(new_vid, c, i == 0)

                es.save(clip, 'clips', clip['id'])

        print("Successfully processed %s" % new_vid['clip_id'])

    es.connection.indices.refresh()
    # BUGFIX: format args were swapped (client name filled the count slot)
    db.success("Updated or created %s legislative days for %s video" %
               (vcount, client_name))
def run(db, es, options=None):
    """Scrape the Senate committee hearings XML feed and upsert hearing
    records into the 'hearings' collection.

    db      -- database wrapper (collection access plus note/warning/success logging)
    es      -- Elasticsearch client (unused here; kept for the task interface)
    options -- optional task options dict (currently unused)
    """
    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    except Exception:
        # Best-effort fetch: record the outage instead of crashing the task.
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0
        for meeting in meetings:
            # Placeholder rows ("No meetings scheduled.") carry no data.
            if re.search(r"^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            # Codes look like "SSFR09": letters = committee id, trailing
            # digits = subcommittee id; "00" means the full committee.
            full_id = meeting.cmte_code.contents[0].strip()
            match = re.search(r"^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                # Subcommittees are stored under the full code (e.g. "SSFR09").
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            elif committee_id == "JCC":
                chamber = "joint"
            else:
                db.warning(
                    "Couldn't locate committee by committee_id %s" % committee_id,
                    {'committee_id': committee_id})
                continue

            # Feed times are Eastern; build an EST-aware datetime.
            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(
                date_string, "%d-%b-%Y %I:%M %p")[0:6],
                tzinfo=python_utils.EST())
            congress = python_utils.current_congress(occurs_at.year)

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. &amp;quot; -- unescape twice
            description = parser.unescape(parser.unescape(description))

            bill_ids = python_utils.extract_bills(description, congress)

            if subcommittee_id is None:
                sub = '00'
            else:
                sub = str(subcommittee_id)

            # making sure the ids are more reproducible: hash a normalized
            # date + committee + subcommittee string
            date_string = occurs_at.strftime("%d-%b-%Y %I:%M %p")
            id_string = (date_string + str(committee_id) + sub).encode("utf-8")
            hearing_id = hashlib.md5(id_string).hexdigest()

            # Match an existing hearing by time OR description so reschedules
            # and reworded notices update rather than duplicate.
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [{'occurs_at': occurs_at}, {'description': description}]
            })

            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {
                    'chamber': chamber,
                    'committee_id': committee_id,
                    'hearing_id': hearing_id
                }
                # Only stamp created_at on first insert.
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id

            hearing['updated_at'] = datetime.datetime.now()
            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)