def test_attending_members_invalid_data(self):
    # Case 1: the protocol is built from a path that does not exist, so
    # looking for attending members is expected to raise IOError.
    missing_file = CommitteeMeetingProtocol.get_from_filename('/foo/bar/baz')
    with missing_file as protocol, self.assertRaises(IOError):
        protocol.find_attending_members([])

    # Case 2: the protocol is built from no text at all; the lookup is
    # expected to return an empty list rather than fail.
    without_text = CommitteeMeetingProtocol.get_from_text(None)
    with without_text as protocol:
        found = protocol.find_attending_members([])
        self.assertEqual([], found)
def extract_speakers_from_txt_file(self, file_object_path, committee_id, meeting_id):
    """Yield one record per speaker found in a stored protocol text file.

    The text is read from the "committees" bucket through ``object_storage``
    using ``self.s3`` and parsed with ``CommitteeMeetingProtocol``.

    Each yielded dict has the keys ``committee_id``, ``meeting_id`` and
    ``name``. Nothing is yielded when the protocol reports ``speakers`` as
    ``None``.
    """
    raw = object_storage.read(self.s3, "committees", file_object_path)
    text = raw.decode()
    with CommitteeMeetingProtocol.get_from_text(text) as protocol:
        speakers = protocol.speakers
        if speakers is None:
            return
        for speaker in speakers:
            record = {"committee_id": committee_id,
                      "meeting_id": meeting_id,
                      "name": speaker}
            yield record
def find_attending_members(self, mks=None, mk_names=None):
    """Add to ``self.mks_attended`` the MKs named as attending in the protocol.

    ``mks`` and ``mk_names`` are used as parallel sequences: the position of
    a name in ``mk_names`` selects the matching entry of ``mks``. When both
    are ``None`` they are loaded with ``get_all_mk_names()``.

    An MK is skipped when ``party_at(self.date)`` is falsy for it.
    """
    logger.debug('find_attending_members')
    nothing_supplied = mks is None and mk_names is None
    if nothing_supplied:
        logger.debug('get_all_mk_names')
        mks, mk_names = get_all_mk_names()

    parsed = KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text)
    with parsed as protocol:
        attended_mk_names = protocol.find_attending_members(mk_names)
        for attended_name in attended_mk_names:
            position = mk_names.index(attended_name)
            # Only MKs that had a party at the meeting date are recorded.
            if mks[position].party_at(self.date):
                self.mks_attended.add(mks[position])

    summary = 'meeting %d now has %d attending members' % (
        self.id, self.mks_attended.count())
    logger.debug(summary)
def create_protocol_parts(self, delete_existing=False, mks=None, mk_names=None):
    """Create protocol parts from this instance's protocol_text.

    Optionally, delete the existing parts (and the annotations attached to
    them) first. If the meeting already has parts and ``delete_existing`` is
    false, a ValidationError is raised, because it doesn't make sense to
    create the parts again.

    Nothing is created when ``protocol_text`` is empty. Plenum meetings are
    delegated to ``create_plenum_protocol_parts``; ``mks`` and ``mk_names``
    are only forwarded to that call.
    """
    logger.debug('create_protocol_parts %s', delete_existing)
    if delete_existing:
        ppct = ContentType.objects.get_for_model(ProtocolPart)
        # The manager method must be *called*: a bound method is not a valid
        # value for an ``__in`` lookup (only old Django releases invoked
        # callable filter values implicitly).
        annotations = Annotation.objects.filter(
            content_type=ppct, object_id__in=self.parts.all())
        logger.debug(
            'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d',
            annotations.count(), self.id)
        annotations.delete()
        self.parts.all().delete()
    elif self.parts.count():
        raise ValidationError(
            'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.')

    if not self.protocol_text:  # sometimes there are empty protocols
        return  # then we don't need to do anything here.

    if self.committee.type == 'plenum':
        create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
        return

    with KnessetDataCommitteeMeetingProtocol.get_from_text(
            self.protocol_text) as protocol:
        # TODO: use bulk_create (I had a strange error when using it)
        # Parts are numbered from 1 in protocol order and saved one by one.
        for order, part in enumerate(protocol.parts, 1):
            logger.debug('creating protocol part %s', order)
            ProtocolPart(meeting=self, order=order,
                         header=part.header, body=part.body).save()

    self.protocol_parts_update_date = datetime.now()
    self.save()
def create_protocol_parts(self, delete_existing=False, mks=None, mk_names=None):
    """Create protocol parts from this instance's protocol_text.

    Optionally, delete the existing parts (and the annotations attached to
    them) first. If the meeting already has parts and ``delete_existing`` is
    false, a ValidationError is raised, because it doesn't make sense to
    create the parts again.

    Nothing is created when ``protocol_text`` is empty. Plenum meetings are
    delegated to ``create_plenum_protocol_parts``; ``mks`` and ``mk_names``
    are only forwarded to that call.
    """
    logger.debug('create_protocol_parts %s', delete_existing)
    if delete_existing:
        ppct = ContentType.objects.get_for_model(ProtocolPart)
        # The manager method must be *called*: a bound method is not a valid
        # value for an ``__in`` lookup (only old Django releases invoked
        # callable filter values implicitly).
        existing_parts = self.parts.all()
        annotations = Annotation.objects.filter(
            content_type=ppct, object_id__in=existing_parts)
        logger.debug(
            'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d',
            annotations.count(), self.id)
        annotations.delete()
        self.parts.all().delete()
    elif self.parts.count():
        raise ValidationError(
            'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.')

    if not self.protocol_text:  # sometimes there are empty protocols
        return  # then we don't need to do anything here.

    if self.committee.type == 'plenum':
        create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
        return

    with KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text) as protocol:
        # TODO: use bulk_create (I had a strange error when using it)
        # Parts are numbered from 1 in protocol order and saved one by one.
        for order, part in enumerate(protocol.parts, 1):
            logger.debug('creating protocol part %s', order)
            ProtocolPart(meeting=self, order=order,
                         header=part.header, body=part.body).save()

    self.protocol_parts_update_date = datetime.now()
    self.save()
def get_kns_committeesession_resource():
    """Yield committee-session rows with attendees merged in from the protocol text.

    Iterates ``kns_committeesession_resource`` and applies the optional
    ``filter-meeting-id`` / ``filter-committee-id`` / ``filter-knesset-num``
    entries of ``parameters``; both names are taken from the enclosing scope.
    For a matching row with a non-empty text file, the protocol text is
    downloaded and ``protocol.attendees`` is merged into the row in place.
    """
    for committeesession_row in kns_committeesession_resource:
        # Each filter is optional: when it is unset/empty every row passes it.
        if ((not parameters.get("filter-meeting-id") or int(committeesession_row["CommitteeSessionID"]) in parameters["filter-meeting-id"])
                and (not parameters.get("filter-committee-id") or int(committeesession_row["CommitteeID"]) in parameters["filter-committee-id"])
                and (not parameters.get("filter-knesset-num") or int(committeesession_row["KnessetNum"]) in parameters["filter-knesset-num"])):
            # Example values of the two columns checked below:
            # text_file_name                                               text_file_size
            # data/committees/meeting_protocols_text/files/5/7/570611.txt  72817
            if (committeesession_row["text_file_name"]
                    and committeesession_row["text_file_size"]
                    and committeesession_row["text_file_size"] > 0):
                protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/{}".format(
                    committeesession_row["text_file_name"])
                # NOTE(review): the HTTP status code is not checked, so an
                # error response body would be parsed as protocol text.
                text = requests.get(protocol_text_url).content.decode("utf-8")
                with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                    # NOTE(review): presumably attendees is always a mapping
                    # here; a None value would make update() raise - confirm.
                    committeesession_row.update(protocol.attendees)
                # NOTE(review): assumes only rows that went through the
                # attendees merge are yielded - confirm against the consumer.
                yield committeesession_row
def find_attending_members(self, mks=None, mk_names=None):
    """Add to ``self.mks_attended`` the MKs named as attending in the protocol.

    ``mks`` and ``mk_names`` are used as parallel sequences: the position of
    a name in ``mk_names`` selects the matching entry of ``mks``. When both
    are ``None`` they are loaded with ``get_all_mk_names()``.

    An MK is skipped when ``party_at(self.date)`` is falsy for it. Any
    exception raised while parsing or recording attendance is caught and
    written to the debug log together with this meeting's id.
    """
    logger.debug('find_attending_members')
    nothing_supplied = mks is None and mk_names is None
    if nothing_supplied:
        logger.debug('get_all_mk_names')
        mks, mk_names = get_all_mk_names()

    try:
        parsed = KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text)
        with parsed as protocol:
            attended_mk_names = protocol.find_attending_members(mk_names)
            for attended_name in attended_mk_names:
                position = mk_names.index(attended_name)
                # Only MKs that had a party at the meeting date are recorded.
                if mks[position].party_at(self.date):
                    self.mks_attended.add(mks[position])
    except Exception:
        # format_exc() renders the exception currently being handled, i.e.
        # the same text as joining format_exception(*sys.exc_info()).
        formatted = traceback.format_exc()
        meeting_tag = '\nCommitteeMeeting.id=' + str(self.id)
        logger.debug("%s%s", formatted, meeting_tag)

    summary = 'meeting %d now has %d attending members' % (
        self.id, self.mks_attended.count())
    logger.debug(summary)
def extract_attendees_from_txt_file(self, file_object_path, committee_id, meeting_id):
    """Yield one record per attendee found in a stored protocol text file.

    The text is read from the "committees" bucket through ``object_storage``
    using ``self.s3`` and parsed with ``CommitteeMeetingProtocol``.

    Each yielded dict has the keys ``committee_id``, ``meeting_id``,
    ``name``, ``role`` and ``additional_information``. Entries under the
    "invitees" key are read as mappings with a ``name`` and an optional
    ``role`` (stored as ``additional_information``); entries under any other
    key are used directly as the name, with the key as the role. Nothing is
    yielded when the protocol reports ``attendees`` as ``None``.
    """
    raw = object_storage.read(self.s3, "committees", file_object_path)
    text = raw.decode()
    with CommitteeMeetingProtocol.get_from_text(text) as protocol:
        attendees = protocol.attendees
        if attendees is None:
            return
        for group, members in attendees.items():
            is_invitee_group = group == "invitees"
            for member in members:
                if is_invitee_group:
                    name = member["name"]
                    role = "invitees"
                    extra = member["role"] if "role" in member else ""
                else:
                    name = member
                    role = group
                    extra = ""
                yield {"committee_id": committee_id,
                       "meeting_id": meeting_id,
                       "name": name,
                       "role": role,
                       "additional_information": extra}
def process_row(row, row_index, spec, resource_index, parameters, stats):
    """Fill the attendee columns of a ``kns_committeesession`` row.

    Rows of any other resource are returned untouched. For session rows the
    ``mks``, ``invitees``, ``legal_advisors`` and ``manager`` keys are first
    reset to ``None``; the row is then processed only if it passes the
    optional ``filter-meeting-id`` / ``filter-committee-id`` /
    ``filter-knesset-num`` parameters and has a ``text_parsed_filename``.

    When the ``KNESSET_PIPELINES_DATA_PATH`` environment variable is set, the
    protocol text is read from that directory and results are cached in a
    JSON file keyed by a hash of the row's ``text_crc32c`` and
    ``parts_crc32c``; otherwise the text is downloaded over HTTP.

    Args:
        row: mutable mapping for the current row; updated in place.
        row_index, resource_index, stats: unused here.
        spec: resource descriptor; only ``spec['name']`` is read.
        parameters: mapping holding the optional filter lists.

    Returns:
        The same ``row`` object.
    """
    if spec['name'] != 'kns_committeesession':
        return row

    row.update(mks=None, invitees=None, legal_advisors=None, manager=None)

    # Every filter is optional; an unset/empty filter lets all rows through.
    filters = (
        ("filter-meeting-id", "CommitteeSessionID"),
        ("filter-committee-id", "CommitteeID"),
        ("filter-knesset-num", "KnessetNum"),
    )
    selected = all(
        not parameters.get(param) or int(row[column]) in parameters[param]
        for param, column in filters
    )
    if not selected or not row["text_parsed_filename"]:
        return row

    data_path = os.environ.get('KNESSET_PIPELINES_DATA_PATH')

    new_cache_hash = old_cache_hash = cache_hash_path = cache_hash_row = None
    if data_path:
        m = BASE_HASH_OBJ.copy()
        m.update(str(row['text_crc32c']).encode())
        m.update(str(row['parts_crc32c']).encode())
        new_cache_hash = m.hexdigest()
        cache_hash_path = os.path.join(
            data_path,
            'people/committees/meeting-attendees/cache_hash/{}.json'.format(
                row["text_parsed_filename"]))
        if os.path.exists(cache_hash_path):
            with open(cache_hash_path, encoding="utf-8") as f:
                cache_data = json.load(f)
            old_cache_hash = cache_data['hash']
            cache_hash_row = cache_data['row']

    # Cache hit: the inputs hash to the same value, reuse the stored result.
    if cache_hash_path and old_cache_hash and old_cache_hash == new_cache_hash:
        row.update(**cache_hash_row)
        return row

    logging.info('getting attendees for meeting {}'.format(
        row['CommitteeSessionID']))
    text = None
    if data_path:
        protocol_text_path = os.path.join(
            data_path,
            'committees/meeting_protocols_text/{}'.format(
                row["text_parsed_filename"]))
        if os.path.exists(protocol_text_path) and os.path.getsize(protocol_text_path) > 0:
            # Decode explicitly as UTF-8, matching the HTTP branch below,
            # instead of depending on the platform's default encoding.
            with open(protocol_text_path, encoding="utf-8") as f:
                text = f.read()
    else:
        protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/data/committees/" \
                            "meeting_protocols_text/{}".format(row["text_parsed_filename"])
        res = requests.get(protocol_text_url)
        if res.status_code == 200:
            text = res.content.decode("utf-8")

    update_row = dict(mks=None, invitees=None, legal_advisors=None, manager=None)
    if text:
        with CommitteeMeetingProtocol.get_from_text(text) as protocol:
            attendees = protocol.attendees
            if attendees:
                update_row = dict(
                    mks=attendees['mks'],
                    invitees=attendees['invitees'],
                    legal_advisors=attendees['legal_advisors'],
                    manager=attendees['manager'],
                    financial_advisors=attendees.get('financial_advisors', []))
    row.update(**update_row)

    if cache_hash_path:
        os.makedirs(os.path.dirname(cache_hash_path), exist_ok=True)
        with open(cache_hash_path, 'w', encoding="utf-8") as f:
            json.dump({'hash': new_cache_hash, 'row': update_row}, f)
    return row