def _merge_matches_across_cycles(matching_views, org_id, given_state_id, StateClass):
    """
    Helper for match_merge_link(): merge matching -States Cycle by Cycle.

    The given -Views are grouped by Cycle. For every Cycle holding more than
    one of them, the associated -States are merged in ascending 'updated'
    order. If the given -State belongs to a group, it is moved to the end of
    that group's merge order so that it takes overarching precedence.

    :param matching_views: QuerySet of -Views whose -States match each other
    :param org_id: Organization ID, passed through to merge_states_with_views
    :param given_state_id: ID of the -State the match was started from
    :param StateClass: PropertyState or TaxLotState
    :return: (count, target_state_id) where count is the number of -States
        fed into merges and target_state_id is either given_state_id or the
        ID of the merged -State produced by a merge involving it
    """
    # One array of state IDs per Cycle; Cycles with a single matching -View
    # have nothing to merge and are filtered out.
    per_cycle_state_ids = (
        matching_views
        .values('cycle_id')
        .annotate(state_ids=ArrayAgg('state_id'), match_count=Count('id'))
        .filter(match_count__gt=1)
        .values_list('state_ids', flat=True)
    )

    merged_total = 0
    resulting_state_id = given_state_id

    for group_state_ids in per_cycle_state_ids:
        merge_order = list(
            StateClass.objects
            .filter(id__in=group_state_ids)
            .order_by('updated')
            .values_list('id', flat=True)
        )

        # The given -State goes last in the merge order to give it precedence.
        involves_given_state = given_state_id in merge_order
        if involves_given_state:
            merge_order.remove(given_state_id)
            merge_order.append(given_state_id)

        merged_state = merge_states_with_views(merge_order, org_id, 'System Match', StateClass)

        # Only a merge that absorbed the given -State changes the target.
        if involves_given_state:
            resulting_state_id = merged_state.id

        merged_total += len(merge_order)

    return merged_total, resulting_state_id
def whole_org_match_merge(org_id):
    """
    Match and merge every PropertyView and TaxLotView of an Organization.

    Algorithm:
        - Handle PropertyViews first, then repeat for TaxLotViews.
        - For each Cycle of the Organization:
            - Look at the -States attached to the -Views of that Cycle.
            - Ignore -States where all matching criteria are None.
            - Group the IDs of -States that match each other.
            - For each group larger than 1, run the merging logic so that
              only one record is left, logged as a "System Match".

    :param org_id: ID of the Organization to process
    :return: dict keyed by 'PropertyState' and 'TaxLotState', each holding
        'merged_count' (number of -States fed into merges) and
        'new_merged_state_ids' (IDs of the resulting merged -States)
    """
    summary = {
        class_name: {
            'merged_count': 0,
            'new_merged_state_ids': []
        }
        for class_name in ('PropertyState', 'TaxLotState')
    }

    state_view_pairs = (
        (PropertyState, PropertyView),
        (TaxLotState, TaxLotView),
    )

    for StateClass, ViewClass in state_view_pairs:
        class_summary = summary[StateClass.__name__]
        column_names = matching_criteria_column_names(org_id, StateClass.__name__)
        cycle_ids = Cycle.objects.filter(organization_id=org_id).values_list('id', flat=True)

        for cycle_id in cycle_ids:
            cycle_views = ViewClass.objects.filter(cycle_id=cycle_id)
            states_in_cycle = StateClass.objects.filter(
                id__in=Subquery(cycle_views.values('state_id'))
            )

            # NOTE(review): empty_criteria_filter is called here with
            # (org_id, StateClass); other call sites in this file pass
            # (StateClass, column_names). The helper is not visible from
            # here - confirm which signature is current.
            match_groups = (
                states_in_cycle
                .exclude(**empty_criteria_filter(org_id, StateClass))
                .values(*column_names)
                .annotate(matched_ids=ArrayAgg('id'), matched_count=Count('id'))
                .values_list('matched_ids', flat=True)
                .filter(matched_count__gt=1)
            )

            for matched_ids in match_groups:
                # Ascending IDs: ensures priority given to most recently uploaded record
                ordered_ids = sorted(matched_ids)

                merged_state = merge_states_with_views(ordered_ids, org_id, 'System Match', StateClass)

                class_summary['merged_count'] += len(ordered_ids)
                class_summary['new_merged_state_ids'].append(merged_state.id)

    return summary
def match_merge_in_cycle(view_id, StateClassName):
    """
    Match and merge the -State behind the given -View within its own Cycle.

    Match-eligible -States are scoped to those associated with -Views within
    the same Cycle as the given -View. If the -State associated with the
    -View doesn't have any matching criteria values populated, it is not
    eligible for a match merge and nothing happens.

    :param view_id: primary key of the PropertyView or TaxLotView
    :param StateClassName: 'PropertyState' or 'TaxLotState'
    :return: (count, view_id) - the number of -States merged and the ID of
        the -View now holding the merged -State; (0, None) when the -State
        has empty matching criteria or no other -State in the Cycle matches
    :raises ValueError: if StateClassName is not a supported class name
    """
    if StateClassName == 'PropertyState':
        StateClass, ViewClass = PropertyState, PropertyView
    elif StateClassName == 'TaxLotState':
        StateClass, ViewClass = TaxLotState, TaxLotView
    else:
        # Fail loudly here instead of with an UnboundLocalError further down.
        raise ValueError("Unsupported StateClassName: %r" % (StateClassName,))

    view = ViewClass.objects.get(pk=view_id)
    org_id = view.state.organization_id

    # Check if associated -State has empty matching criteria.
    # NOTE(review): empty_criteria_filter / matching_filter_criteria are
    # called here with org-based arguments while other call sites in this
    # file pass (StateClass, column_names) / (state, column_names). The
    # helpers are not visible from here - confirm which signature is current.
    if StateClass.objects.filter(pk=view.state_id, **empty_criteria_filter(org_id, StateClass)).exists():
        return 0, None

    matching_criteria = matching_filter_criteria(org_id, StateClassName, view.state)
    views_in_cycle = ViewClass.objects.filter(cycle_id=view.cycle_id)
    state_matches = StateClass.objects.filter(
        pk__in=Subquery(views_in_cycle.values('state_id')),
        **matching_criteria
    ).exclude(pk=view.state_id)

    state_ids = list(
        state_matches.order_by('updated').values_list('id', flat=True)
    )
    state_ids.append(view.state_id)  # Excluded above and appended to give merge precedence

    # state_ids always contains at least the given -State, so a length of 1
    # means no other -State in the Cycle matched.
    count = len(state_ids)
    if count == 1:
        return 0, None

    # Merge order is ascending 'updated' with the given -State last; the merge
    # is logged as a 'System Match'.
    merged_state = merge_states_with_views(state_ids, org_id, 'System Match', StateClass)
    merged_view_id = ViewClass.objects.get(state_id=merged_state.id).id

    return count, merged_view_id
def states_to_views(unmatched_state_ids, org, cycle, StateClass):
    """
    Take incoming -States and apply them to -Views.

    In the process of doing so, -States could be flagged for "deletion" (and
    not applied to a -View), merged with existing -States, or found to be
    brand new. Regardless, the goal is to ultimately associate -States to
    -Views.

    For incoming -States needing to be matched to an existing -State, merge
    them and take the existing -State's -View to be the -View for the new
    merged state.

    For directly promote-able -States, a new -View and canonical object
    (Property or TaxLot) are created for it.

    :param unmatched_state_ids: list of IDs of the incoming -States
    :param org: Organization object
    :param cycle: Cycle object
    :param StateClass: PropertyState or TaxLotState
    :return: tuple of (processed_views, duplicate_count, new_count,
        matched_count, merged_between_existing_count)
    :raises ValueError: if StateClass is neither PropertyState nor TaxLotState
    :raises IntegrityError: re-raised with added context when merging or
        promoting fails inside the transaction
    """
    table_name = StateClass.__name__

    if table_name == 'PropertyState':
        ViewClass = PropertyView
    elif table_name == 'TaxLotState':
        ViewClass = TaxLotView
    else:
        # Fail loudly here instead of with an UnboundLocalError further down.
        raise ValueError("Unsupported StateClass: %r" % (table_name,))

    # Identify existing used -States
    existing_cycle_views = ViewClass.objects.filter(cycle_id=cycle)
    existing_states = StateClass.objects.filter(
        pk__in=Subquery(existing_cycle_views.values('state_id')))

    # Apply DATA_STATE_DELETE to incoming duplicate -States of existing -States in Cycle
    duplicate_states = StateClass.objects.filter(
        pk__in=unmatched_state_ids,
        hash_object__in=Subquery(existing_states.values('hash_object')))
    duplicate_count = duplicate_states.update(data_state=DATA_STATE_DELETE)

    column_names = matching_criteria_column_names(org.id, table_name)

    # For the remaining incoming -States (filtering those duplicates), identify
    # -States with all matching criteria being None. These aren't eligible for matching.
    empty_matching_criteria = empty_criteria_filter(StateClass, column_names)
    promote_states = StateClass.objects.filter(
        pk__in=unmatched_state_ids,
        **empty_matching_criteria).exclude(
            pk__in=Subquery(duplicate_states.values('id')))

    # Identify and filter out -States that have been "handled".
    handled_states = promote_states | duplicate_states
    unmatched_states = StateClass.objects.filter(
        pk__in=unmatched_state_ids).exclude(
            pk__in=Subquery(handled_states.values('id')))

    # For the remaining -States, search for a match within the -States that are attached to -Views.
    # If one match is found, pass that along.
    # If multiple matches are found, merge them together, pass along the resulting record.
    # Otherwise, add current -State to be promoted as is.
    merged_between_existing_count = 0
    merge_state_pairs = []
    for state in unmatched_states:
        matching_criteria = matching_filter_criteria(state, column_names)
        existing_state_matches = StateClass.objects.filter(
            pk__in=Subquery(existing_cycle_views.values('state_id')),
            **matching_criteria)
        count = existing_state_matches.count()

        if count > 1:
            merged_between_existing_count += count
            existing_state_ids = list(
                existing_state_matches.order_by('updated').values_list(
                    'id', flat=True))
            # Existing matches are merged in ascending 'updated' order and logged as a 'System Match'.
            merged_state = merge_states_with_views(existing_state_ids, org.id, 'System Match', StateClass)
            merge_state_pairs.append((merged_state, state))
        elif count == 1:
            merge_state_pairs.append((existing_state_matches.first(), state))
        else:
            promote_states = promote_states | StateClass.objects.filter(
                pk=state.id)

    # Process -States into -Views either directly (promoted_ids) or post-merge (merge_state_pairs).
    _log.debug("There are %s merge_state_pairs and %s promote_states" %
               (len(merge_state_pairs), promote_states.count()))
    priorities = Column.retrieve_priorities(org.pk)
    processed_views = []
    promoted_ids = []
    merged_state_ids = []
    try:
        with transaction.atomic():
            for existing_state, newer_state in merge_state_pairs:
                existing_view = ViewClass.objects.get(
                    state_id=existing_state.id)

                # Merge -States and assign new/merged -State to existing -View
                merged_state = save_state_match(existing_state, newer_state,
                                                priorities)
                existing_view.state = merged_state
                existing_view.save()

                processed_views.append(existing_view)
                merged_state_ids.append(merged_state.id)

            for state in promote_states:
                promoted_ids.append(state.id)
                created_view = state.promote(cycle)
                processed_views.append(created_view)
    except IntegrityError as e:
        # Chain explicitly so the original database error stays attached as the cause.
        raise IntegrityError("Could not merge results with error: %s" % (e)) from e

    new_count = len(promoted_ids)
    # update merge_state while excluding any states that were a product of a previous, file-inclusive merge
    StateClass.objects.filter(pk__in=promoted_ids).exclude(
        merge_state=MERGE_STATE_MERGED).update(merge_state=MERGE_STATE_NEW)
    matched_count = StateClass.objects.filter(pk__in=merged_state_ids).update(
        data_state=DATA_STATE_MATCHING, merge_state=MERGE_STATE_MERGED)

    # A -View can be appended more than once above, so de-duplicate before returning.
    return list(
        set(processed_views)
    ), duplicate_count, new_count, matched_count, merged_between_existing_count
def whole_org_match_merge_link(org_id, state_class_name, proposed_columns=None):
    """
    For a given organization, run a match merge round for each cycle in
    isolation. Afterwards, run a match link round across all cycles at once.

    In this context, a Property/TaxLot Set refers to the -State, canonical
    record, and -View records associated by the -View.

    Algorithm - Run for either Property Sets or for TaxLot Sets:
        For each Cycle, run match and merges.
            - Focus on -States associated with -Views in this Cycle.
            - Ignore -States where all matching criteria is None.
            - Group -State IDs by whether they match each other.
            - Ignore each group of size 1 (not matched).
            - For each remaining group, run merge logic so that there's only
              one Set left. Any labels, notes, pairings, and meters are
              transferred to and persisted in this Set.

        Across all Cycles, run match and links.
            - Focus on all -States and canonical records associated to -Views
              in this organization.
            - Identify canonical records that currently have no links. These
              are unaffected during this process if the record remains
              unlinked. Also, these are canonical records that can
              potentially be reused.
            - Scope the next steps to ignore -Views with -States where all
              matching criteria is None.
            - Create link groups of canonical IDs and -View IDs according to
              whether their associated -States match each other.
            - Ignore groups of size 1 where the single member was previously
              unlinked as well.
            - For each remaining group, apply a new canonical record to each
              of the -Views in this group. Any meters are transferred to this
              new canonical record.
            - For any records that had empty (all None) matching criteria
              values, disassociate any previous links by applying a new
              canonical record to each.
            - Delete any unused canonical records.

    :param org_id: ID of the Organization to process
    :param state_class_name: 'PropertyState' or 'TaxLotState'
    :param proposed_columns: optional list of column names to match on
        instead of the Organization's matching criteria. When non-empty the
        call is a preview run: results are captured and the transaction is
        rolled back.
    :return: for a regular run, a summary dict keyed by 'PropertyState' and
        'TaxLotState' with 'merged_count' and 'linked_sets_count'; for a
        preview run, the result of properties_across_cycles /
        taxlots_across_cycles captured before the rollback
    :raises ValueError: if state_class_name is not a supported class name
    """
    # Validate before touching the database so that a bad class name fails
    # loudly here instead of with an UnboundLocalError further down.
    if state_class_name == 'PropertyState':
        StateClass = PropertyState
        ViewClass = PropertyView
        CanonicalClass = Property
    elif state_class_name == 'TaxLotState':
        StateClass = TaxLotState
        ViewClass = TaxLotView
        CanonicalClass = TaxLot
    else:
        raise ValueError("Unsupported state_class_name: %r" % (state_class_name,))

    summary = {
        'PropertyState': {
            'merged_count': 0,
            'linked_sets_count': 0,
        },
        'TaxLotState': {
            'merged_count': 0,
            'linked_sets_count': 0,
        },
    }

    cycle_ids = Cycle.objects.filter(organization_id=org_id).values_list(
        'id', flat=True)

    if proposed_columns:
        # Use column names as given (replacing address_line_1 with normalized_address)
        column_names = [
            column_name
            if column_name != 'address_line_1' else 'normalized_address'
            for column_name in proposed_columns
        ]
        preview_run = True
    else:
        column_names = matching_criteria_column_names(org_id, state_class_name)
        preview_run = False

    empty_matching_criteria = empty_criteria_filter(StateClass, column_names)

    with transaction.atomic():
        # Match merge within each Cycle
        for cycle_id in cycle_ids:
            view_in_cycle = ViewClass.objects.filter(cycle_id=cycle_id)

            matched_id_groups = StateClass.objects.\
                filter(id__in=Subquery(view_in_cycle.values('state_id'))).\
                exclude(**empty_matching_criteria).\
                values(*column_names).\
                annotate(matched_ids=ArrayAgg('id'), matched_count=Count('id')).\
                values_list('matched_ids', flat=True).\
                filter(matched_count__gt=1)

            for state_ids in matched_id_groups:
                ordered_ids = list(
                    StateClass.objects.filter(
                        id__in=state_ids).order_by('updated').values_list(
                            'id', flat=True))

                merge_states_with_views(ordered_ids, org_id, 'System Match',
                                        StateClass)

                summary[StateClass.__name__]['merged_count'] += len(state_ids)

        # Match link across the whole Organization
        # Append 'state__' to dict keys used for filtering so that filtering can be done across associations
        state_appended_col_names = {
            'state__' + col_name
            for col_name in column_names
        }
        state_appended_empty_matching_criteria = {
            'state__' + col_name: v
            for col_name, v in empty_matching_criteria.items()
        }

        canonical_id_col = 'property_id' if StateClass == PropertyState else 'taxlot_id'

        # Looking at all -Views in Org across Cycles
        org_views = ViewClass.objects.\
            filter(cycle_id__in=cycle_ids).\
            select_related('state')

        # Identify all canonical_ids that are currently used once and are potentially reusable
        reusable_canonical_ids = org_views.\
            values(canonical_id_col).\
            annotate(use_count=Count(canonical_id_col)).\
            values_list(canonical_id_col, flat=True).\
            filter(use_count=1)

        # Snapshot the single-use canonical IDs before any -View is re-pointed.
        # The QuerySet above is lazy: testing membership against it directly
        # would first evaluate it inside the loop below, possibly after
        # earlier iterations had already changed use counts, which made the
        # outcome depend on group order. A set also gives O(1) lookups.
        initially_reusable_ids = set(reusable_canonical_ids)

        # Ignoring -Views associated to -States with empty matching criteria, group by columns
        link_groups = org_views.\
            exclude(**state_appended_empty_matching_criteria).\
            values(*state_appended_col_names).\
            annotate(
                canonical_ids=ArrayAgg(canonical_id_col),
                view_ids=ArrayAgg('id'),
                link_count=Count('id')
            ).\
            values_list('canonical_ids', 'view_ids', 'link_count')

        unused_canonical_ids = []
        for canonical_ids, view_ids, link_count in link_groups:
            # If the canonical record was unlinked and is still unlinked, do nothing
            if link_count == 1 and canonical_ids[0] in initially_reusable_ids:
                continue

            # Otherwise, create a new canonical record, copy meters if applicable, and apply the new record to old -Views
            new_record = CanonicalClass.objects.create(organization_id=org_id)

            if CanonicalClass == Property:
                canonical_ids.sort(
                    reverse=True
                )  # Ensures priority given by most recently created canonical record
                for canonical_id in canonical_ids:
                    new_record.copy_meters(canonical_id, source_persists=True)

            ViewClass.objects.filter(id__in=view_ids).update(
                **{canonical_id_col: new_record.id})

            summary[StateClass.__name__]['linked_sets_count'] += 1

            unused_canonical_ids += canonical_ids

        # For records with empty criteria and without reusable canonical IDs, apply a new ID.
        # NOTE(review): reusable_canonical_ids is used as a live subquery
        # here, so it reflects use counts AFTER the linking above, not the
        # snapshot taken earlier. Kept as in the original - confirm that the
        # post-link state is what is intended for this exclusion.
        empty_criteria_views = ViewClass.objects.\
            select_related('state').\
            filter(cycle_id__in=cycle_ids, **state_appended_empty_matching_criteria).\
            exclude(**{canonical_id_col + "__in": reusable_canonical_ids})

        for view in empty_criteria_views:
            # Create a new canonical record, copy meters if applicable, and apply the new record to old -Views
            new_record = CanonicalClass.objects.create(organization_id=org_id)

            if CanonicalClass == Property:
                new_record.copy_meters(getattr(view, canonical_id_col),
                                       source_persists=False)

            setattr(view, canonical_id_col, new_record.id)
            view.save()

        # Also delete these unusable canonical records
        # NOTE(review): this re-runs the query after the -Views above were
        # saved with their new canonical IDs, so it presumably no longer
        # returns the old IDs - verify that the old canonical records are
        # actually being collected for deletion.
        unused_canonical_ids += empty_criteria_views.values_list(
            canonical_id_col, flat=True)

        # Delete canonical records that are no longer used.
        CanonicalClass.objects.filter(id__in=unused_canonical_ids).delete()

        # If this was a preview run, capture results here and rollback.
        if preview_run:
            if state_class_name == 'PropertyState':
                summary = properties_across_cycles(org_id, -1, cycle_ids)
            else:
                summary = taxlots_across_cycles(org_id, -1, cycle_ids)

            transaction.set_rollback(True)

    return summary