def test_possible_matching_speakers(self):
    """Check the people proposed for a speaker name by each matching algorithm.

    Two people holding a 'Member of Parliament' position from 2011-01-01
    onwards are created, then an unsaved entry on a 2011-01-02 sitting is
    queried with several speaker names.
    """
    entry = Entry(
        sitting=Sitting(start_date=datetime.date(2011, 1, 2)),
    )

    smith = Person.objects.create(
        legal_name='James Smith',
        slug='james-smith',
    )
    titled_smith = Person.objects.create(
        title='Mr',
        legal_name='Bob Smith James',
        slug='james-smith2',
    )

    mp_title = PositionTitle.objects.create(
        name='Member of Parliament',
        slug='mp',
    )
    # Both people get the same open-ended political position.
    for politician in (smith, titled_smith):
        Position.objects.create(
            person=politician,
            title=mp_title,
            start_date=ApproximateDate(year=2011, month=1, day=1),
            end_date=ApproximateDate(future=True),
            category='political',
        )

    def candidates(algorithm):
        # Query the entry's current speaker_name with the given algorithm.
        return entry.possible_matching_speakers(
            name_matching_algorithm=algorithm)

    entry.speaker_name = 'James Smith'
    self.assertListEqual(
        list(candidates(NAME_SUBSTRING_MATCH)), [smith])

    entry.speaker_name = 'Mr Smith'
    self.assertItemsEqual(
        candidates(NAME_SUBSTRING_MATCH), (smith, titled_smith))
    self.assertListEqual(
        list(candidates(NAME_SET_INTERSECTION_MATCH)), [titled_smith])

    entry.speaker_name = 'Mr James Smith'
    self.assertListEqual(
        list(candidates(NAME_SUBSTRING_MATCH)), [smith])
    self.assertListEqual(
        list(candidates(NAME_SET_INTERSECTION_MATCH)),
        [titled_smith, smith])
def create_entries_from_data_and_source(cls, data, source):
    """Create the Sitting for ``source`` and one Entry per transcript line.

    ``data`` is read as a mapping with:

    * ``data['meta']`` - must contain ``'venue'`` (a Venue slug); may
      contain ``'start_time'`` and ``'end_time'`` (``None`` when absent).
    * ``data['transcript']`` - an iterable of mappings, each with
      ``'type'``, ``'page_number'`` and ``'text'``, and optionally
      ``'speaker_name'`` / ``'speaker_title'`` (default ``''``).

    Entries are numbered from 1 in transcript order via ``text_counter``.
    On success ``source.last_processing_success`` is set to the current
    time and ``source`` is saved.

    Always returns ``None``.  Raises ``KeyError`` for missing required keys
    and whatever ``Venue.objects.get`` raises for an unknown slug.
    """
    meta = data['meta']
    venue = Venue.objects.get(slug=meta['venue'])
    start_time = meta.get('start_time')

    # Joint Sittings can be published by both Houses (identical documents);
    # prevent the same Sitting being created twice.
    is_duplicate_joint_sitting = (
        'Joint Sitting' in source.name
        and Sitting.objects.filter(
            venue=venue,
            source__name=source.name,
            start_date=source.date,
            start_time=start_time,
        ).exists()
    )
    if is_duplicate_joint_sitting:
        # Single parenthesised argument: same output as the print statement
        # on Python 2, and still valid syntax on Python 3.
        print("skipping duplicate source %s for %s" % (source.name, source.date))
        return None

    sitting = Sitting(
        source=source,
        venue=venue,
        start_date=source.date,
        start_time=start_time,
        end_date=source.date,
        end_time=meta.get('end_time'),
    )
    sitting.save()

    # The sitting is saved before this block; only the entries are written
    # inside commit_on_success.
    with transaction.commit_on_success():
        for counter, line in enumerate(data['transcript'], 1):
            entry = Entry(
                sitting=sitting,
                type=line['type'],
                page_number=line['page_number'],
                text_counter=counter,
                speaker_name=line.get('speaker_name', ''),
                speaker_title=line.get('speaker_title', ''),
                content=line['text'],
            )
            entry.save()

    source.last_processing_success = datetime.datetime.now()
    source.save()

    return None
def test_multiple_politician_name_matches_senate(self):
    """A 'Jones' entry on the senate sitting yields only ``self.senator``."""
    speech = Entry(
        sitting=self.senate_sitting,
        type='text',
        page_number=12,
        text_counter=4,
        speaker_name='Jones',
        speaker_title='Hon.',
        content='test',
    )

    candidates = speech.possible_matching_speakers(
        name_matching_algorithm=NAME_SUBSTRING_MATCH)

    # Exactly one candidate, and it is the senator fixture.
    self.assertEqual(1, len(candidates))
    self.assertEqual(self.senator, candidates[0])
def create_entries_from_data_and_source(cls, data, source):
    """Create the Sitting for ``source`` and one Entry per transcript line.

    ``data`` is read as a mapping with:

    * ``data['meta']`` - must contain ``'venue'`` (a Venue slug); may
      contain ``'start_time'`` and ``'end_time'`` (``None`` when absent).
    * ``data['transcript']`` - an iterable of mappings, each with
      ``'type'``, ``'page_number'`` and ``'text'``, and optionally
      ``'speaker_name'`` / ``'speaker_title'`` (default ``''``).

    Entries are numbered from 1 in transcript order via ``text_counter``.
    On success ``source.last_processing_success`` is set to the current
    time and ``source`` is saved.

    Always returns ``None``.  Raises ``KeyError`` for missing required keys
    and whatever ``Venue.objects.get`` raises for an unknown slug.
    """
    meta = data['meta']
    venue = Venue.objects.get(slug=meta['venue'])
    start_time = meta.get('start_time')

    # Joint Sittings can be published by both Houses (identical documents);
    # prevent the same Sitting being created twice.
    if 'Joint Sitting' in source.name:
        already_loaded = Sitting.objects.filter(
            venue=venue,
            source__name=source.name,
            start_date=source.date,
            start_time=start_time,
        ).exists()
        if already_loaded:
            # Single parenthesised argument: same output as the print
            # statement on Python 2, and still valid syntax on Python 3.
            print("skipping duplicate source %s for %s" % (source.name, source.date))
            return None

    sitting = Sitting(
        source=source,
        venue=venue,
        start_date=source.date,
        start_time=start_time,
        end_date=source.date,
        end_time=meta.get('end_time'),
    )
    sitting.save()

    # The sitting is saved before this block; only the entries are written
    # inside commit_on_success.
    with transaction.commit_on_success():
        for counter, line in enumerate(data['transcript'], 1):
            Entry(
                sitting=sitting,
                type=line['type'],
                page_number=line['page_number'],
                text_counter=counter,
                speaker_name=line.get('speaker_name', ''),
                speaker_title=line.get('speaker_title', ''),
                content=line['text'],
            ).save()

    source.last_processing_success = datetime.datetime.now()
    source.save()

    return None
def test_multiple_politician_name_matches_joint_sitting(self):
    """After renaming the source to a joint sitting, 'Jones' has two matches."""
    # Rename the source fixture and persist the change before matching.
    self.source.name = "Joint Sitting of the Parliament"
    self.source.save()

    speech = Entry(
        sitting=self.na_sitting,
        type='text',
        page_number=12,
        text_counter=4,
        speaker_name='Jones',
        speaker_title='Hon.',
        content='test',
    )

    candidates = speech.possible_matching_speakers(
        name_matching_algorithm=NAME_SUBSTRING_MATCH)

    self.assertEqual(2, len(candidates))
def test_multiple_politician_name_matches_senate(self):
    """Only ``self.senator`` is proposed for 'Jones' on the senate sitting."""
    entry_fields = dict(
        sitting=self.senate_sitting,
        type='text',
        page_number=12,
        text_counter=4,
        speaker_name='Jones',
        speaker_title='Hon.',
        content='test',
    )
    speech = Entry(**entry_fields)

    matches = speech.possible_matching_speakers(
        name_matching_algorithm=NAME_SUBSTRING_MATCH)

    # One match expected, and it must be the senator fixture.
    self.assertEqual(1, len(matches))
    self.assertEqual(self.senator, matches[0])
def test_exclude_hidden_profiles(self):
    """With ``self.senator`` hidden, 'Jones' resolves to ``self.mp`` alone."""
    # Hide the senator fixture and persist the flag before matching.
    self.senator.hidden = True
    self.senator.save()

    speech = Entry(
        sitting=self.senate_sitting,
        type='text',
        page_number=12,
        text_counter=4,
        speaker_name='Jones',
        speaker_title='Hon.',
        content='test',
    )

    candidates = speech.possible_matching_speakers(
        name_matching_algorithm=NAME_SUBSTRING_MATCH)

    self.assertEqual(1, len(candidates))
    self.assertEqual(self.mp, candidates[0])
def test_exclude_hidden_profiles(self):
    """A hidden senator is not proposed; the single match is ``self.mp``."""
    hidden_person = self.senator
    hidden_person.hidden = True
    hidden_person.save()

    entry_fields = dict(
        sitting=self.senate_sitting,
        type='text',
        page_number=12,
        text_counter=4,
        speaker_name='Jones',
        speaker_title='Hon.',
        content='test',
    )
    matches = Entry(**entry_fields).possible_matching_speakers(
        name_matching_algorithm=NAME_SUBSTRING_MATCH)

    # Exactly one match remains once the senator is hidden.
    self.assertEqual(1, len(matches))
    self.assertEqual(self.mp, matches[0])
def create_entries_from_data_and_source(cls, data, source):
    """Create the Sitting for ``source`` and one Entry per transcript line.

    ``data`` is read as a mapping with:

    * ``data['meta']`` - must contain ``'venue'`` (a Venue slug); may
      contain ``'start_time'`` and ``'end_time'`` (``None`` when absent).
    * ``data['transcript']`` - an iterable of mappings, each with
      ``'type'``, ``'page_number'`` and ``'text'``, and optionally
      ``'speaker_name'`` / ``'speaker_title'`` (default ``''``).

    Entries are numbered from 1 in transcript order via ``text_counter``.
    On success ``source.last_processing_success`` is set to the current
    time and ``source`` is saved.

    Always returns ``None``.  Raises ``KeyError`` for missing required keys
    and whatever ``Venue.objects.get`` raises for an unknown slug.
    """
    meta = data['meta']
    venue = Venue.objects.get(slug=meta['venue'])

    sitting = Sitting(
        source=source,
        venue=venue,
        start_date=source.date,
        start_time=meta.get('start_time'),
        end_date=source.date,
        end_time=meta.get('end_time'),
    )
    sitting.save()

    # The sitting is saved before this block; only the entries are written
    # inside commit_on_success.
    with transaction.commit_on_success():
        for counter, line in enumerate(data['transcript'], 1):
            entry = Entry(
                sitting=sitting,
                type=line['type'],
                page_number=line['page_number'],
                text_counter=counter,
                speaker_name=line.get('speaker_name', ''),
                speaker_title=line.get('speaker_title', ''),
                content=line['text'],
            )
            entry.save()

    source.last_processing_success = datetime.datetime.now()
    source.save()

    return None
def test_alias_match_score(self):
    """Pin ``Entry.alias_match_score`` for a table of string pairs."""
    # (first argument, second argument, expected score)
    expectations = (
        ('Mr Bob Smith', 'Mr Bob Smith', 3),
        ('Mr Bob Smith', 'Mr Smith', 2),
        ('Mr Bob Smith', 'Bob Smith', 2),
        ('Mr Bob Smith', 'Bob', 1),
        ('Bob Smith', 'Smith, Bob', 2),
        ('Mr Bob Smith', 'Miss Alice Jones', 0),
    )
    for first, second, expected_score in expectations:
        # A fresh, unsaved Entry is used for every comparison.
        self.assertEqual(
            Entry().alias_match_score(first, second), expected_score)
def test_assign_speaker_names(self):
    """Test that the speaker names are assigned as expected.

    After each change to the people, positions and aliases in the database,
    ``Entry.assign_speakers`` is re-run with the configured name matching
    algorithm and the remaining unassigned speeches and aliases are counted.
    """
    # This should really be in a separate file as it is not related to the
    # Kenya parser, but keeping it here for now as it is a step in the
    # parsing flow that is being tested.

    # Set up the entries (the helper's return value is not needed here).
    self._create_source_and_load_test_json_to_entries()

    entry_qs = Entry.objects.all()
    unassigned_aliases_qs = Alias.objects.all().unassigned()

    def assign_and_check(unassigned_speeches, unassigned_aliases):
        # Re-run the assignment, then verify both remaining counts.
        Entry.assign_speakers(
            name_matching_algorithm=settings.HANSARD_NAME_MATCHING_ALGORITHM)
        self.assertEqual(
            entry_qs.unassigned_speeches().count(), unassigned_speeches)
        self.assertEqual(unassigned_aliases_qs.count(), unassigned_aliases)

    # Check that none of the speakers are assigned.
    self.assertEqual(entry_qs.unassigned_speeches().count(), 31)

    # Assign speakers: nothing changes, as no people have been created yet.
    assign_and_check(31, 11)

    # Add a person that should match but has no position - no match.
    james_gabbow = Person.objects.create(
        legal_name='James Gabbow',
        slug='james-gabbow',
    )
    assign_and_check(31, 11)

    # Create the position - check matched.
    mp = PositionTitle.objects.create(
        name='Member of Parliament',
        slug='mp',
    )
    Position.objects.create(
        person=james_gabbow,
        title=mp,
        start_date=ApproximateDate(year=2011, month=1, day=1),
        end_date=ApproximateDate(future=True),
        category='political',
    )
    assign_and_check(26, 10)

    # Add a nominated MP and check it is matched.
    nominated_politician = PositionTitle.objects.create(
        name='Nominated MP',
        slug='nominated-member-parliament',
    )
    calist_mwatela = Person.objects.create(
        legal_name='Calist Mwatela',
        slug='calist-mwatela',
    )
    Position.objects.create(
        person=calist_mwatela,
        title=nominated_politician,
        start_date=ApproximateDate(year=2011, month=1, day=1),
        end_date=ApproximateDate(future=True),
        category='political',
    )
    assign_and_check(24, 9)

    # Add an mp that is no longer current, check not matched.  The expired
    # position must belong to the newly created person; attaching it to
    # james_gabbow (as before) left Bob Musila with no position at all, so
    # the "no longer current" case was never exercised.
    bob_musila = Person.objects.create(
        legal_name='Bob Musila',
        slug='bob-musila',
    )
    Position.objects.create(
        person=bob_musila,
        title=mp,
        start_date=ApproximateDate(year=2007, month=1, day=1),
        end_date=ApproximateDate(year=2009, month=1, day=1),
        category='political',
    )
    assign_and_check(24, 9)

    # Add a name to the aliases and check it is matched.
    betty_laboso = Person.objects.create(
        legal_name='Betty Laboso',
        slug='betty-laboso',
    )
    betty_laboso_alias = Alias.objects.get(alias='Dr. Laboso')
    betty_laboso_alias.person = betty_laboso
    betty_laboso_alias.save()
    assign_and_check(22, 8)

    # Mark an alias as ignored: its speeches stay unassigned, but the alias
    # no longer counts as unassigned.
    prof_kaloki_alias = Alias.objects.get(alias='Prof. Kaloki')
    prof_kaloki_alias.ignored = True
    prof_kaloki_alias.save()
    assign_and_check(22, 7)

    # Point all remaining aliases at a person and check that all matched.
    for alias in unassigned_aliases_qs.all():
        alias.person = betty_laboso
        alias.save()
    assign_and_check(8, 0)
def handle_noargs(self, **options):
    """Run ``Entry.assign_speakers`` with no arguments.

    ``options`` is accepted but not used.
    """
    Entry.assign_speakers()
def test_possible_matching_speakers(self):
    """Check the people proposed for a speaker name by each matching algorithm.

    The entry hangs off an unsaved sitting (2011-01-02) with an unsaved
    source and venue; two people holding a 'Member of Parliament' position
    from 2011-01-01 onwards are the possible matches.
    """
    test_source = Source(
        name='Test source',
        url='http://example.com/foo/bar/testing',
        date=datetime.date(2011, 1, 3),
    )
    test_venue = Venue(
        slug='test-venue',
        name='Test Venue',
    )
    entry = Entry(
        sitting=Sitting(
            start_date=datetime.date(2011, 1, 2),
            source=test_source,
            venue=test_venue,
        ),
    )

    smith = Person.objects.create(
        legal_name='James Smith',
        slug='james-smith',
    )
    titled_smith = Person.objects.create(
        title='Mr',
        legal_name='Bob Smith James',
        slug='james-smith2',
    )

    mp_title = PositionTitle.objects.create(
        name='Member of Parliament',
        slug='mp',
    )
    # Both people get the same open-ended political position.
    for politician in (smith, titled_smith):
        Position.objects.create(
            person=politician,
            title=mp_title,
            start_date=ApproximateDate(year=2011, month=1, day=1),
            end_date=ApproximateDate(future=True),
            category='political',
        )

    def candidates(algorithm):
        # Query the entry's current speaker_name with the given algorithm.
        return entry.possible_matching_speakers(
            name_matching_algorithm=algorithm)

    entry.speaker_name = 'James Smith'
    self.assertListEqual(
        list(candidates(NAME_SUBSTRING_MATCH)), [smith])

    entry.speaker_name = 'Mr Smith'
    self.assertItemsEqual(
        candidates(NAME_SUBSTRING_MATCH), (smith, titled_smith))
    self.assertListEqual(
        list(candidates(NAME_SET_INTERSECTION_MATCH)), [titled_smith])

    entry.speaker_name = 'Mr James Smith'
    self.assertListEqual(
        list(candidates(NAME_SUBSTRING_MATCH)), [smith])
    self.assertListEqual(
        list(candidates(NAME_SET_INTERSECTION_MATCH)),
        [titled_smith, smith])
def test_assign_speaker_names(self):
    """Test that the speaker names are assigned as expected.

    After each change to the people, positions and aliases in the database,
    ``Entry.assign_speakers`` is re-run (with its default arguments) and the
    remaining unassigned speeches and aliases are counted.
    """
    # This should really be in a separate file as it is not related to the
    # Kenya parser, but keeping it here for now as it is a step in the
    # parsing flow that is being tested.

    # Set up the entries (the helper's return value is not needed here).
    self._create_source_and_load_test_json_to_entries()

    entry_qs = Entry.objects.all()
    unassigned_aliases_qs = Alias.objects.all().unassigned()

    def assign_and_check(unassigned_speeches, unassigned_aliases):
        # Re-run the assignment, then verify both remaining counts.
        Entry.assign_speakers()
        self.assertEqual(
            entry_qs.unassigned_speeches().count(), unassigned_speeches)
        self.assertEqual(unassigned_aliases_qs.count(), unassigned_aliases)

    # Check that none of the speakers are assigned.
    self.assertEqual(entry_qs.unassigned_speeches().count(), 31)

    # Assign speakers: nothing changes, as no people have been created yet.
    assign_and_check(31, 11)

    # Add a person that should match but has no position - no match.
    james_gabbow = Person.objects.create(
        legal_name='James Gabbow',
        slug='james-gabbow',
    )
    assign_and_check(31, 11)

    # Create the position - check matched.
    mp = PositionTitle.objects.create(
        name='Member of Parliament',
        slug='mp',
    )
    Position.objects.create(
        person=james_gabbow,
        title=mp,
        start_date=ApproximateDate(year=2011, month=1, day=1),
        end_date=ApproximateDate(future=True),
    )
    assign_and_check(26, 10)

    # Add a nominated MP and check it is matched.
    nominated_politician = PositionTitle.objects.create(
        name='Nominated MP',
        slug='nominated-member-parliament',
    )
    calist_mwatela = Person.objects.create(
        legal_name='Calist Mwatela',
        slug='calist-mwatela',
    )
    Position.objects.create(
        person=calist_mwatela,
        title=nominated_politician,
        start_date=ApproximateDate(year=2011, month=1, day=1),
        end_date=ApproximateDate(future=True),
    )
    assign_and_check(24, 9)

    # Add an mp that is no longer current, check not matched.  The expired
    # position must belong to the newly created person; attaching it to
    # james_gabbow (as before) left Bob Musila with no position at all, so
    # the "no longer current" case was never exercised.
    bob_musila = Person.objects.create(
        legal_name='Bob Musila',
        slug='bob-musila',
    )
    Position.objects.create(
        person=bob_musila,
        title=mp,
        start_date=ApproximateDate(year=2007, month=1, day=1),
        end_date=ApproximateDate(year=2009, month=1, day=1),
    )
    assign_and_check(24, 9)

    # Add a name to the aliases and check it is matched.
    betty_laboso = Person.objects.create(
        legal_name='Betty Laboso',
        slug='betty-laboso',
    )
    betty_laboso_alias = Alias.objects.get(alias='Dr. Laboso')
    betty_laboso_alias.person = betty_laboso
    betty_laboso_alias.save()
    assign_and_check(22, 8)

    # Mark an alias as ignored: its speeches stay unassigned, but the alias
    # no longer counts as unassigned.
    prof_kaloki_alias = Alias.objects.get(alias='Prof. Kaloki')
    prof_kaloki_alias.ignored = True
    prof_kaloki_alias.save()
    assign_and_check(22, 7)

    # Point all remaining aliases at a person and check that all matched.
    for alias in unassigned_aliases_qs.all():
        alias.person = betty_laboso
        alias.save()
    assign_and_check(8, 0)
def handle_noargs(self, **options):
    """Run ``Entry.assign_speakers`` with the algorithm named in settings.

    The algorithm is read from ``settings.HANSARD_NAME_MATCHING_ALGORITHM``;
    ``options`` is accepted but not used.
    """
    Entry.assign_speakers(
        name_matching_algorithm=settings.HANSARD_NAME_MATCHING_ALGORITHM,
    )