def _save_raw_data_chunk(chunk, file_pk, prog_key, increment, *args, **kwargs):
    """Save the raw data to the database."""
    import_file = ImportFile.objects.get(pk=file_pk)

    # Save our "column headers" and sample rows for F/E.
    source_type = get_source_type(import_file)
    for c in chunk:
        raw_bs = BuildingSnapshot()
        raw_bs.import_file = import_file
        raw_bs.extra_data = c
        raw_bs.source_type = source_type

        # We require a save to get our PK.
        # We save here to set our initial source PKs.
        raw_bs.save()
        super_org = import_file.import_record.super_organization
        raw_bs.super_organization = super_org

        set_initial_sources(raw_bs)
        raw_bs.save()

    # Indicate progress
    increment_cache(prog_key, increment)

    return True
def test_increment_cache(self):
    """Sum our progress by increments properly."""
    expected = 25.0
    test_key = make_key('increment_test')
    increment = 25.0
    # Fresh increment, this initializes the value.
    increment_cache(test_key, increment)
    self.assertEqual(float(get_cache(test_key)['progress']), expected)

    # Increment an existing key
    increment_cache(test_key, increment)
    expected = 50.0
    self.assertEqual(float(get_cache(test_key)['progress']), expected)

    # This should put us well over 100.0 in incrementation w/o bounds check.
    for i in range(10):
        increment_cache(test_key, increment)

    expected = 100.0
    self.assertEqual(float(get_cache(test_key)['progress']), expected)
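# A minimal sketch of the clamping behaviour the test above relies on: the
# cached 'progress' value stops at 100.0 no matter how many increments arrive.
# This is an illustration only -- the dict-backed cache and the helper names
# (_FAKE_CACHE, sketch_get_cache, sketch_increment_cache) are assumptions,
# not the project's actual get_cache/increment_cache implementation.
_FAKE_CACHE = {}


def sketch_get_cache(key):
    """Return the cached progress dict, defaulting to 0.0 progress."""
    return _FAKE_CACHE.get(key, {'progress': 0.0})


def sketch_increment_cache(key, increment):
    """Add ``increment`` to the cached progress, clamping at 100.0."""
    data = sketch_get_cache(key)
    data['progress'] = min(float(data['progress']) + float(increment), 100.0)
    _FAKE_CACHE[key] = data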
def _delete_organization_taxlot_state_chunk(del_ids, prog_key, increment, org_pk, *args, **kwargs):
    """deletes a list of ``del_ids`` and increments the cache"""
    TaxLotState.objects.filter(organization_id=org_pk, pk__in=del_ids).delete()
    increment_cache(prog_key, increment * 100)
def _delete_organization_property_state_chunk(del_ids, prog_key, increment, org_pk, *args, **kwargs):
    """deletes a list of ``del_ids`` and increments the cache"""
    PropertyState.objects.filter(pk__in=del_ids).delete()
    increment_cache(prog_key, increment * 100)
def fake_func(import_file_pk):
    increment_cache(key, increment)
def map_row_chunk(chunk, file_pk, source_type, prog_key, increment, *args, **kwargs):
    """Does the work of matching a mapping to a source type and saving.

    :param chunk: list of dict of str. One row's worth of parse data.
    :param file_pk: int, the PK for an ImportFile obj.
    :param source_type: int, represented by either ASSESSED_RAW or PORTFOLIO_RAW.
    :param prog_key: string, key of the progress key.
    :param increment: double, value by which to increment progress key.
    :param cleaner: (optional), the cleaner class you want to send to
        mapper.map_row (e.g. turn numbers into floats).
    :param raw_ids: (optional kwarg), the list of ids in chunk order.
    """
    import_file = ImportFile.objects.get(pk=file_pk)
    save_type = PORTFOLIO_BS
    if source_type == ASSESSED_RAW:
        save_type = ASSESSED_BS

    concats = []

    org = Organization.objects.get(
        pk=import_file.import_record.super_organization.pk
    )

    mapping, concats = get_column_mappings(org)
    map_cleaner = _build_cleaner(org)

    # For those column mappings which are not db columns, we
    # need to let MCM know to apply our mapping function to them.
    apply_columns = []

    mappable_columns = get_mappable_columns()
    for item in mapping:
        if mapping[item] not in mappable_columns:
            apply_columns.append(item)

    apply_func = apply_data_func(mappable_columns)

    for row in chunk:
        model = mapper.map_row(
            row,
            mapping,
            BuildingSnapshot,
            cleaner=map_cleaner,
            concat=concats,
            apply_columns=apply_columns,
            apply_func=apply_func,
            *args,
            **kwargs
        )
        model.import_file = import_file
        model.source_type = save_type
        model.clean()
        model.super_organization = import_file.import_record.super_organization
        model.save()

        if model:
            # Make sure that we've saved all of the extra_data column names.
            save_column_names(model, mapping=mapping)

    increment_cache(prog_key, increment)
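# A hypothetical caller for map_row_chunk, showing one way the per-chunk
# ``increment`` can be derived: split the parsed rows into fixed-size chunks
# and give each chunk an equal share of the 0-100 progress range. The chunk
# size, helper name, and the synchronous loop are assumptions for
# illustration; in the real pipeline the chunks would typically be fanned out
# as asynchronous tasks rather than run inline.
def sketch_dispatch_mapping(rows, file_pk, source_type, prog_key, chunk_size=100):
    """Split ``rows`` into chunks and map each one, advancing progress."""
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]
    increment = 100.0 / (len(chunks) or 1)
    for chunk in chunks:
        map_row_chunk(chunk, file_pk, source_type, prog_key, increment)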
def _delete_organization_buildings_chunk(del_ids, prog_key, increment, org_pk, *args, **kwargs):
    """deletes a list of ``del_ids`` and increments the cache"""
    qs = BuildingSnapshot.objects.filter(super_organization=org_pk)
    qs.filter(pk__in=del_ids).delete()
    increment_cache(prog_key, increment * 100)
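# The delete helpers above appear to receive ``increment`` as a fraction of
# the whole job and scale it to a percentage with ``increment * 100`` when
# updating the progress cache. A dispatcher consistent with that convention
# might look like the sketch below; the helper name, chunk size, and the
# synchronous loop are assumptions, not the project's actual delete task.
def sketch_delete_org_buildings(org_pk, prog_key, chunk_size=300):
    """Delete an org's BuildingSnapshots in chunks, advancing progress."""
    ids = list(
        BuildingSnapshot.objects.filter(super_organization=org_pk)
        .values_list('pk', flat=True)
    )
    chunks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
    # Fraction of the job per chunk; the chunk helper multiplies by 100.
    increment = 1.0 / (len(chunks) or 1)
    for del_ids in chunks:
        _delete_organization_buildings_chunk(del_ids, prog_key, increment, org_pk)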
def _match_buildings(file_pk, user_pk):
    """Exact-address search against all of the canonical_building snapshots for an org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(users=import_file.import_record.owner)[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    duplicates = []
    newly_matched_building_pks = []

    # Filter out matches based on ID. If an unmatched record is a duplicate of
    # existing data, add it to a list and record which existing record it
    # duplicates.
    for unmatched in unmatched_buildings:
        try:
            match = handle_id_matches(unmatched, import_file, user_pk)
        except DuplicateDataError as e:
            duplicates.append(unmatched.pk)
            unmatched.duplicate_id = e.id
            unmatched.save()
            continue

        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Remove the duplicates identified above.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=duplicates
    ).values_list(*BS_VALUES_LIST)
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Normalize the addresses so we can match on the address_1 field. This is
    # not ideal, because two locations could share an address_1 value but sit
    # in different cities.
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4]) for unmatched in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, so all
        # unmatched buildings become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # Reverse index so we can retrieve the canonical PK for a normalized
    # address after a match. Each value in canonical_buildings is the list of
    # identifying data for one building.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0]
        for value in canonical_buildings
    }

    # Normalize the canonical addresses as well; again, matching only on
    # address_1 is not ideal, but it is what we compare against.
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]

    # For progress tracking.
    num_unmatched = len(unmatched_normalized_addresses) or 1
    increment = (1.0 / num_unmatched) * 100

    import_file.mapping_completion = 0
    import_file.save()

    # Exact-match each unmatched normalized address against the canonical set.
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _find_matches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)

    return {'status': 'success'}
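# A self-contained illustration of the exact-match strategy in
# _match_buildings: normalize each address, build a reverse index from
# normalized canonical address to canonical PK, and look unmatched addresses
# up in it. The toy normalizer below (lowercase, collapse whitespace) is a
# stand-in assumption; the real _normalize_address_str and _find_matches do
# more than this sketch shows.
import re


def sketch_normalize_address(addr):
    """Toy normalizer: lowercase and collapse internal whitespace."""
    return re.sub(r'\s+', ' ', (addr or '').strip().lower())


def sketch_match_addresses(unmatched, canonical):
    """Map unmatched PKs to canonical PKs on exact normalized address.

    Both arguments are iterables of (pk, address) pairs.
    """
    rev_idx = {sketch_normalize_address(addr): pk for pk, addr in canonical}
    matches = {}
    for pk, addr in unmatched:
        hit = rev_idx.get(sketch_normalize_address(addr))
        if hit is not None:
            matches[pk] = hit
    return matches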
def _delete_organization_taxlot_state_chunk(del_ids, prog_key, increment, org_pk, *args, **kwargs):
    """deletes a list of ``del_ids`` and increments the cache"""
    qs = TaxLotState.objects.filter(organization_id=org_pk)
    qs.filter(pk__in=del_ids).delete()
    increment_cache(prog_key, increment * 100)