def run(db, es, options={}):
    """Scrape the senate.gov committee hearings XML feed and upsert hearings.

    Fetches the feed, parses each <meeting> element, resolves the committee,
    and creates or updates a document in db['hearings'] matched on chamber +
    committee_id + (occurs_at OR description).

    db      -- project database wrapper (note/warning/success logging, 'hearings' collection)
    es      -- unused here; kept for task-runner signature compatibility
    options -- unused here; kept for task-runner signature compatibility
    """
    try:
        page = urllib2.urlopen("http://www.senate.gov/general/committee_schedules/hearings.xml")
    # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate
    except Exception:
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0

        for meeting in meetings:
            # skip placeholder rows like "No hearings scheduled."
            if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            full_id = meeting.cmte_code.contents[0].strip()
            # codes look like "SSAS01": committee letters + 2-digit subcommittee
            match = re.search("^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            # "00" means the full committee; real subcommittees keep the full code
            if (subcommittee_id == "00") or (subcommittee_id == None):
                subcommittee_id = None
            else:
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            else:
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning("Couldn't locate committee by committee_id %s" % committee_id, {'committee_id': committee_id})
                    continue

            committee_url = meeting.committee['url']

            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST())
            congress = rtc_utils.current_congress(occurs_at.year)

            document = None
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. "&amp;#160;"
            description = parser.unescape(parser.unescape(description))

            bill_ids = rtc_utils.extract_bills(description, congress)

            # match an existing hearing by time OR description so edits to one
            # field don't create duplicates
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [{'occurs_at': occurs_at}, {'description': description}]
            })

            hearing = None
            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {'chamber': chamber, 'committee_id': committee_id}
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id

            hearing['updated_at'] = datetime.datetime.now()
            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
def run(db, es, options={}):
    """Scrape the senate.gov committee hearings XML feed and upsert hearings.

    Fetches the feed, parses each <meeting> element, resolves the committee,
    and creates or updates a document in db["hearings"] matched on chamber +
    committee_id + (occurs_at OR description).

    db      -- project database wrapper (note/warning/success logging, "hearings" collection)
    es      -- unused here; kept for task-runner signature compatibility
    options -- unused here; kept for task-runner signature compatibility
    """
    try:
        page = urllib2.urlopen("http://www.senate.gov/general/committee_schedules/hearings.xml")
    # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate
    except Exception:
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll("meeting")
        parser = HTMLParser.HTMLParser()

        count = 0

        for meeting in meetings:
            # skip placeholder rows like "No hearings scheduled."
            if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            full_id = meeting.cmte_code.contents[0].strip()
            # BUG FIX: calling .groups() directly raised AttributeError when the
            # code didn't match "LETTERS+digits"; guard the match first
            match = re.search("^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            # "00" means the full committee; real subcommittees keep the full code
            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee["chamber"]
            else:
                # BUG FIX: condition was inverted (`!=`), which treated every
                # unknown committee as "joint" and warned only for JCC
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning(
                        "Couldn't locate committee by committee_id %s" % committee_id,
                        {"committee_id": committee_id}
                    )
                    continue

            committee_url = meeting.committee["url"]

            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST())
            congress = rtc_utils.current_congress(occurs_at.year)

            document = None
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace("\n", "")

            # content is double-escaped, e.g. "&amp;#160;"
            description = parser.unescape(parser.unescape(description))

            bill_ids = rtc_utils.extract_bills(description, congress)

            # match an existing hearing by time OR description so edits to one
            # field don't create duplicates
            documents = db["hearings"].find(
                {
                    "chamber": chamber,
                    "committee_id": committee_id,
                    "$or": [{"occurs_at": occurs_at}, {"description": description}],
                }
            )

            hearing = None
            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {"chamber": chamber, "committee_id": committee_id}
                hearing["created_at"] = datetime.datetime.now()

            if subcommittee_id:
                hearing["subcommittee_id"] = subcommittee_id

            hearing["updated_at"] = datetime.datetime.now()
            hearing.update(
                {
                    "congress": congress,
                    "occurs_at": occurs_at,
                    "room": room,
                    "description": description,
                    "dc": True,
                    "bill_ids": bill_ids,
                }
            )

            if committee:
                hearing["committee"] = committee

            db["hearings"].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
def run(db, es, options={}):
    """Scrape the senate.gov committee hearings XML feed and upsert hearings.

    Fetches the feed, parses each <meeting> element, resolves the committee,
    and creates or updates a document in db['hearings'] matched on chamber +
    committee_id + (occurs_at OR description).

    db      -- project database wrapper (note/warning/success logging, 'hearings' collection)
    es      -- unused here; kept for task-runner signature compatibility
    options -- unused here; kept for task-runner signature compatibility
    """
    try:
        page = urllib2.urlopen(
            "http://www.senate.gov/general/committee_schedules/hearings.xml")
    # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate
    except Exception:
        db.note("Couldn't load Senate hearings feed, can't proceed")
    else:
        soup = BeautifulStoneSoup(page)
        meetings = soup.findAll('meeting')
        parser = HTMLParser.HTMLParser()

        count = 0

        for meeting in meetings:
            # skip placeholder rows like "No hearings scheduled."
            if re.search("^No.*?scheduled\.?$", meeting.matter.contents[0]):
                continue

            full_id = meeting.cmte_code.contents[0].strip()
            # BUG FIX: calling .groups() directly raised AttributeError when the
            # code didn't match "LETTERS+digits"; guard the match first
            match = re.search("^([A-Z]+)(\d+)$", full_id)
            if match:
                committee_id, subcommittee_id = match.groups()
            else:
                committee_id, subcommittee_id = full_id, None

            # "00" means the full committee; real subcommittees keep the full code
            if (subcommittee_id == "00") or (subcommittee_id is None):
                subcommittee_id = None
            else:
                subcommittee_id = full_id

            committee = committee_for(db, committee_id)

            # Don't warn if it's a bill-specific conference committee
            if committee:
                chamber = committee['chamber']
            else:
                # BUG FIX: condition was inverted (`!=`), which treated every
                # unknown committee as "joint" and warned only for JCC
                if committee_id == "JCC":
                    chamber = "joint"
                else:
                    db.warning(
                        "Couldn't locate committee by committee_id %s" % committee_id,
                        {'committee_id': committee_id})
                    continue

            committee_url = meeting.committee['url']

            date_string = meeting.date.contents[0].strip()
            occurs_at = datetime.datetime(*time.strptime(
                date_string, "%d-%b-%Y %I:%M %p")[0:6], tzinfo=rtc_utils.EST())
            congress = rtc_utils.current_congress(occurs_at.year)

            document = None
            if meeting.document:
                document = meeting.document.contents[0].strip()

            room = meeting.room.contents[0].strip()
            description = meeting.matter.contents[0].strip().replace('\n', '')

            # content is double-escaped, e.g. "&amp;#160;"
            description = parser.unescape(parser.unescape(description))

            bill_ids = rtc_utils.extract_bills(description, congress)

            # match an existing hearing by time OR description so edits to one
            # field don't create duplicates
            documents = db['hearings'].find({
                'chamber': chamber,
                'committee_id': committee_id,
                "$or": [{'occurs_at': occurs_at}, {'description': description}]
            })

            hearing = None
            if documents.count() > 0:
                hearing = documents[0]
            else:
                hearing = {'chamber': chamber, 'committee_id': committee_id}
                hearing['created_at'] = datetime.datetime.now()

            if subcommittee_id:
                hearing['subcommittee_id'] = subcommittee_id

            hearing['updated_at'] = datetime.datetime.now()
            hearing.update({
                'congress': congress,
                'occurs_at': occurs_at,
                'room': room,
                'description': description,
                'dc': True,
                'bill_ids': bill_ids
            })

            if committee:
                hearing['committee'] = committee

            db['hearings'].save(hearing)
            count += 1

        db.success("Updated or created %s Senate committee hearings" % count)
def get_videos(db, es, client_name, chamber, archive=False, captions=False): api_url = API_PREFIX + client_name + '?type=video' data = '{ "sort": [ {"datetime": {"order": "desc" }} ] }' if archive: api_url += '&size=100000' else: api_url += '&size=2' videos = query_api(db, api_url, data) if not videos: db.warning("Granicus API appears to be down", {'errors': PARSING_ERRORS}) sys.exit() vcount = 0 for vid in videos: v = vid['_source'] legislative_day = dateparse(v['datetime']) video_id = chamber + '-' + str(int(timey.mktime(legislative_day.timetuple()))) new_vid = db.get_or_initialize('videos', {'video_id': video_id}) #initialize arrays and dicts so we don't have to worry about it later if not new_vid.has_key('clip_urls'): new_vid['clip_urls'] = {} if not new_vid.has_key('bill_ids'): new_vid['bill_ids'] = [] if not new_vid.has_key('legislator_ids'): new_vid['legislator_ids'] = [] if not new_vid.has_key('legislator_names'): new_vid['legislator_names'] = [] if not new_vid.has_key('created_at'): new_vid['created_at'] = datetime.now() new_vid['updated_at'] = datetime.now() #video id, clips array, legislators array, bills array new_vid = try_key(v, 'id', 'clip_id', new_vid) new_vid = try_key(v, 'duration', 'duration', new_vid) new_vid = try_key(v, 'datetime', 'published_at', new_vid) # normalize timestamp format to RFC3339 in UTC new_vid['published_at'] = rfc3339.rfc3339(dateparse(new_vid['published_at']), utc=True) new_vid['clip_urls'] = try_key(v, 'http', 'mp4', new_vid['clip_urls']) new_vid['clip_urls'] = try_key(v, 'hls', 'hls', new_vid['clip_urls']) new_vid['clip_urls'] = try_key(v, 'rtmp', 'rtmp', new_vid['clip_urls']) new_vid['legislative_day'] = legislative_day.strftime('%Y-%m-%d') new_vid['chamber'] = chamber new_vid['congress'] = rtc_utils.current_congress(legislative_day.year) if chamber == 'house': new_vid['clips'], new_vid['bill_ids'], new_vid['legislator_names'], new_vid['legislator_ids'], new_vid['roll_ids'] = get_markers(db, client_name, 
new_vid['clip_id'], new_vid['congress'], chamber) elif chamber == 'senate': new_vid['clips'], new_vid['bill_ids'], new_vid['legislator_names'], new_vid['legislator_ids'], new_vid['roll_ids'] = get_clips_for_senate(db, new_vid['clip_id'], new_vid['congress'], new_vid['duration'], dateparse(new_vid['published_at']).year) #make sure the last clip has a duration if new_vid['clips'] and len(new_vid['clips']) > 0: new_vid['clips'][-1]['duration'] = new_vid['duration'] - new_vid['clips'][-1]['offset'] if captions: new_vid['captions'], new_vid['caption_srt_file'] = get_captions(client_name, new_vid['clip_id']) db['videos'].save(new_vid) vcount += 1 #index clip objects in elastic search if captions and new_vid.has_key('clips') and new_vid['clips'] is not None and len(new_vid['clips']) > 0: for c in new_vid['clips']: clip = { 'id': "%s-%s" % (new_vid['video_id'], new_vid['clips'].index(c)), 'video_id': new_vid['video_id'], 'video_clip_id': new_vid['clip_id'], 'offset': c['offset'], 'duration': c['duration'], 'legislative_day': new_vid['legislative_day'], 'published_at': new_vid['published_at'], 'clip_urls': new_vid['clip_urls'] } clip = try_key(c, 'legislator_names', 'legislator_names', clip) clip = try_key(c, 'roll_ids', 'roll_ids', clip) clip = try_key(c, 'events', 'events', clip) clip = try_key(c, 'bill_ids', 'bill_ids', clip) clip = try_key(c, 'legislator_ids', 'legislator_ids', clip) if new_vid.has_key('caption_srt_file'): clip['srt_link'] = new_vid['caption_srt_file'], if new_vid.has_key('captions'): clip['captions'] = get_clip_captions(new_vid, c, c == new_vid['clips'][0] ) #pass a boolean if this is the first clip resp = es.save(clip, 'clips', clip['id']) if resp['ok'] == False: PARSING_ERRORS.append('Could not successfully save to elasticsearch - video_id: %s' % resp['_id']) print "Successfully processed %s" % new_vid['clip_id'] es.connection.refresh() db.success("Updated or created %s legislative days for %s video" % (client_name, vcount))