def _get_names(self):
    """Return the parsed name objects for this entity, building them on first use.

    Every alias is normalised with ``fix_name`` and de-duplicated through a
    set. Each distinct result is wrapped in ``PersonName`` when the entity
    type is 'politician' or 'individual', or in ``OrganizationName`` when it
    is 'organization'. Any other type leaves the list empty. The list is
    cached on ``self._names`` so later calls return it without recomputing.
    """
    if self._names is not None:
        return self._names

    # Install the cache before touching the aliases, as the lookup below may be costly.
    self._names = []
    distinct_aliases = set(fix_name(entry.alias) for entry in self.aliases.all())
    for cleaned in distinct_aliases:
        if self.type == 'organization':
            self._names.append(OrganizationName(cleaned))
        elif self.type in ['politician', 'individual']:
            self._names.append(PersonName(cleaned))
    return self._names
def handle(self, *args, **kwargs):
    """Report federal/state politician entities whose names appear to match.

    The optional first positional argument selects the output mode:

    * ``'full'`` (default) -- pretty-print the per-group totals, then every
      group with its matched pairs.
    * ``'trusted'`` / ``'possible'`` -- print CSV rows for the groups listed
      in ``self.trusted`` / ``self.possible``, skipping any pair in which
      either name appears in ``self.excluded``.

    Any other argument prints an error message and returns without querying.

    Aliases of politician entities are loaded with raw SQL, parsed into
    ``PersonName`` objects and bucketed by last name. Within a bucket every
    entity whose seat starts with 'federal' is compared against every entity
    whose seat starts with 'state'. A pair that passes ``name1.matches(name2)``
    is filed under each "<state check> | <party check> | <name check>" key
    that holds for it.
    """
    display_modes = ("trusted", "possible", "full")
    if args:
        if args[0] not in display_modes:
            # List every accepted mode so the message agrees with the check above.
            print("Unexpected argument '%s'. Options are 'trusted', 'possible' or 'full'" % args[0])
            return
        display = args[0]
    else:
        display = 'full'

    cursor = connection.cursor()

    # ORM is way too slow for this on 100,000+ rows.
    cursor.execute("""
        SELECT DISTINCT a.entity_id,a.alias,m.state,m.party,m.seat FROM
        matchbox_entityalias a LEFT JOIN politician_metadata_latest_cycle_view m ON
        m.entity_id=a.entity_id LEFT JOIN matchbox_entity e ON m.entity_id=e.id
        WHERE e.type = %s
    """, ['politician'])
    rows = cursor.fetchall()

    # Add person name classes; NULL columns become empty strings.
    rows = [
        (entity_id, PersonName(fix_name(alias or "")), state or "", party or "", seat or "")
        for entity_id, alias, state, party, seat in rows
    ]

    # group all entities by last name
    by_last_name = defaultdict(list)
    for row in rows:
        by_last_name[row[1].last].append(row)

    all_conditions = ('missing_middle', 'nicknames', 'missing_suffix',
                      'initials', 'first_as_middle')

    def minimal_name_checks(name1, name2):
        """Return {label: bool} for the condition sets tried, smallest first.

        The search stops at the first set of conditions under which the two
        names match, so at most one label ends up True.
        """
        name_checks = {'exact': name1.matches(name2, exact=True)}

        def check(conditions):
            if not conditions:
                return name_checks['exact']
            key = ", ".join(conditions)
            # Explicit membership test: dict.get(key, default) would evaluate
            # the default (and recurse into matches()) even on a cache hit.
            if key not in name_checks:
                name_checks[key] = (
                    not check(conditions[:-1])
                    and name1.matches(name2, exact=True,
                                      **dict.fromkeys(conditions, True))
                )
            return name_checks[key]

        # "+ 1" so that the combination of *all* conditions is tried as well.
        for size in range(len(all_conditions) + 1):
            for conds in combinations(all_conditions, size):
                if check(conds):
                    return name_checks
        return name_checks

    totals = defaultdict(int)
    groups = defaultdict(list)

    for entities in by_last_name.values():
        # for each last name, split entities into groups of state and federal politicians
        # this will make all the "left sides" of the matches federal and all the right sides state
        fed_entities = [entity for entity in entities if entity[4].startswith('federal')]
        state_entities = [entity for entity in entities if entity[4].startswith('state')]

        for eid1, name1, state1, party1, office1 in fed_entities:
            for eid2, name2, state2, party2, office2 in state_entities:
                # skip if maximal fuzziness fails
                if not name1.matches(name2):
                    continue

                state_checks = {
                    'same state': state1 == state2,
                    'diff state': state1 != state2,
                    'missing one state': (not state1 or not state2) and state1 != state2,
                    'missing two states': not state1 and not state2,
                }

                # NOTE(review): `in "RD"` is a substring test, so an empty
                # (missing) party counts as R-or-D here. Left unchanged because
                # the resulting keys are what self.trusted / self.possible
                # presumably refer to -- confirm before tightening.
                party_checks = {
                    'same party': party1 == party2,
                    'diff party; both 3rd': party1 not in "RD" and party2 not in "RD" and party1 != party2,
                    'diff party; one 3rd': party1 != party2 and (party1 not in "RD" or party2 not in "RD") and (party1 in "RD" or party2 in "RD"),
                    'diff party; R or D': party1 != party2 and (party1 in "RD" and party2 in "RD"),
                }

                # Find the smallest set of name matching conditions that matches
                name_checks = minimal_name_checks(name1, name2)
                name_label = next(
                    (label for label, passed in name_checks.items() if passed), None)
                if name_label is None:
                    continue

                match = (
                    (eid1, name1.name, state1, party1, office1),
                    (eid2, name2.name, state2, party2, office2),
                )

                for state_label, state_ok in state_checks.items():
                    if not state_ok:
                        continue
                    for party_label, party_ok in party_checks.items():
                        if not party_ok:
                            continue
                        key = " | ".join((state_label, party_label, name_label))
                        totals[key] += 1
                        # names can have multiple aliases which cause duplicate entity matches
                        # don't add these
                        if match not in groups[key]:
                            groups[key].append(match)

    if display == 'full':
        pprint(dict(totals))
        for group in sorted(groups):
            print("%s %s" % (group, len(groups[group])))
            for n1, n2 in groups[group]:
                print("  %s %s" % (n1, n2))

    elif display in ('trusted', 'possible'):
        out = StringIO()
        try:
            writer = csv.writer(out)
            for group in getattr(self, display):
                # .get() avoids inserting empty entries into the defaultdict.
                for n1, n2 in groups.get(group, ()):
                    if n1[1] not in self.excluded and n2[1] not in self.excluded:
                        writer.writerow(n1 + n2)
            print(out.getvalue())
        finally:
            out.close()
def handle(self, *args, **kwargs):
    """Report federal/state politician entities whose names appear to match.

    The optional first positional argument selects the output mode:

    * ``'full'`` (default) -- pretty-print the per-group totals, then every
      group with its matched pairs.
    * ``'trusted'`` / ``'possible'`` -- print CSV rows for the groups listed
      in ``self.trusted`` / ``self.possible``, skipping any pair in which
      either name appears in ``self.excluded``.

    Any other argument prints an error message and returns without querying.

    Aliases of politician entities are loaded with raw SQL, parsed into
    ``PersonName`` objects and bucketed by last name. Within a bucket every
    entity whose seat starts with 'federal' is compared against every entity
    whose seat starts with 'state'. A pair that passes ``name1.matches(name2)``
    is filed under each "<state check> | <party check> | <name check>" key
    that holds for it.
    """
    display_modes = ("trusted", "possible", "full")
    if args:
        if args[0] not in display_modes:
            # List every accepted mode so the message agrees with the check above.
            print("Unexpected argument '%s'. Options are 'trusted', 'possible' or 'full'" % args[0])
            return
        display = args[0]
    else:
        display = 'full'

    cursor = connection.cursor()

    # ORM is way too slow for this on 100,000+ rows.
    cursor.execute("""
        SELECT DISTINCT a.entity_id,a.alias,m.state,m.party,m.seat FROM
        matchbox_entityalias a LEFT JOIN politician_metadata_latest_cycle_view m ON
        m.entity_id=a.entity_id LEFT JOIN matchbox_entity e ON m.entity_id=e.id
        WHERE e.type = %s
    """, ['politician'])
    rows = cursor.fetchall()

    # Add person name classes; NULL columns become empty strings.
    rows = [
        (entity_id, PersonName(fix_name(alias or "")), state or "", party or "", seat or "")
        for entity_id, alias, state, party, seat in rows
    ]

    # group all entities by last name
    by_last_name = defaultdict(list)
    for row in rows:
        by_last_name[row[1].last].append(row)

    all_conditions = ('missing_middle', 'nicknames', 'missing_suffix',
                      'initials', 'first_as_middle')

    def minimal_name_checks(name1, name2):
        """Return {label: bool} for the condition sets tried, smallest first.

        The search stops at the first set of conditions under which the two
        names match, so at most one label ends up True.
        """
        name_checks = {'exact': name1.matches(name2, exact=True)}

        def check(conditions):
            if not conditions:
                return name_checks['exact']
            key = ", ".join(conditions)
            # Explicit membership test: dict.get(key, default) would evaluate
            # the default (and recurse into matches()) even on a cache hit.
            if key not in name_checks:
                name_checks[key] = (
                    not check(conditions[:-1])
                    and name1.matches(name2, exact=True,
                                      **dict.fromkeys(conditions, True))
                )
            return name_checks[key]

        # "+ 1" so that the combination of *all* conditions is tried as well.
        for size in range(len(all_conditions) + 1):
            for conds in combinations(all_conditions, size):
                if check(conds):
                    return name_checks
        return name_checks

    totals = defaultdict(int)
    groups = defaultdict(list)

    for entities in by_last_name.values():
        # for each last name, split entities into groups of state and federal politicians
        # this will make all the "left sides" of the matches federal and all the right sides state
        fed_entities = [entity for entity in entities if entity[4].startswith('federal')]
        state_entities = [entity for entity in entities if entity[4].startswith('state')]

        for eid1, name1, state1, party1, office1 in fed_entities:
            for eid2, name2, state2, party2, office2 in state_entities:
                # skip if maximal fuzziness fails
                if not name1.matches(name2):
                    continue

                state_checks = {
                    'same state': state1 == state2,
                    'diff state': state1 != state2,
                    'missing one state': (not state1 or not state2) and state1 != state2,
                    'missing two states': not state1 and not state2,
                }

                # NOTE(review): `in "RD"` is a substring test, so an empty
                # (missing) party counts as R-or-D here. Left unchanged because
                # the resulting keys are what self.trusted / self.possible
                # presumably refer to -- confirm before tightening.
                party_checks = {
                    'same party': party1 == party2,
                    'diff party; both 3rd': party1 not in "RD" and party2 not in "RD" and party1 != party2,
                    'diff party; one 3rd': party1 != party2 and (party1 not in "RD" or party2 not in "RD") and (party1 in "RD" or party2 in "RD"),
                    'diff party; R or D': party1 != party2 and (party1 in "RD" and party2 in "RD"),
                }

                # Find the smallest set of name matching conditions that matches
                name_checks = minimal_name_checks(name1, name2)
                name_label = next(
                    (label for label, passed in name_checks.items() if passed), None)
                if name_label is None:
                    continue

                match = (
                    (eid1, name1.name, state1, party1, office1),
                    (eid2, name2.name, state2, party2, office2),
                )

                for state_label, state_ok in state_checks.items():
                    if not state_ok:
                        continue
                    for party_label, party_ok in party_checks.items():
                        if not party_ok:
                            continue
                        key = " | ".join((state_label, party_label, name_label))
                        totals[key] += 1
                        # names can have multiple aliases which cause duplicate entity matches
                        # don't add these
                        if match not in groups[key]:
                            groups[key].append(match)

    if display == 'full':
        pprint(dict(totals))
        for group in sorted(groups):
            print("%s %s" % (group, len(groups[group])))
            for n1, n2 in groups[group]:
                print("  %s %s" % (n1, n2))

    elif display in ('trusted', 'possible'):
        out = StringIO()
        try:
            writer = csv.writer(out)
            for group in getattr(self, display):
                # .get() avoids inserting empty entries into the defaultdict.
                for n1, n2 in groups.get(group, ()):
                    if n1[1] not in self.excluded and n2[1] not in self.excluded:
                        writer.writerow(n1 + n2)
            print(out.getvalue())
        finally:
            out.close()