Example #1
def _delete_organization_buildings(org_pk, chunk_size=100, *args, **kwargs):
    """Deletes all BuildingSnapshot instances within an organization

    :param org_pk: int, str, the organization pk
    """
    qs = BuildingSnapshot.objects.filter(super_organization=org_pk)
    ids = qs.values_list('id', flat=True)
    deleting_cache_key = get_prog_key(
        'delete_organization_buildings',
        org_pk
    )
    if not ids:
        cache.set(deleting_cache_key, 100)
        return

    # delete the canonical buildings
    can_ids = CanonicalBuilding.objects.filter(
        canonical_snapshot__super_organization=org_pk
    ).values_list('id', flat=True)
    _delete_canonical_buildings.delay(can_ids)

    step = float(chunk_size) / len(ids)
    cache.set(deleting_cache_key, 0)
    tasks = []
    for del_ids in batch(ids, chunk_size):
        # we could also use .s instead of .subtask and not wrap the *args
        tasks.append(
            _delete_organization_buildings_chunk.subtask(
                (del_ids, deleting_cache_key, step, org_pk)
            )
        )
    chord(tasks, interval=15)(finish_delete.subtask([org_pk]))
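The chord above fans each batch out to a companion chunk task. A minimal sketch of what that task plausibly does, assuming it deletes the batch and reports progress through the increment_cache helper seen in later examples; the real _delete_organization_buildings_chunk in SEED may differ, and treating step as a percentage is an assumption (BuildingSnapshot and increment_cache come from the surrounding examples' namespace):

from celery import shared_task

@shared_task
def _delete_organization_buildings_chunk(
        del_ids, prog_key, step, org_pk, *args, **kwargs):
    """Hypothetical sketch: delete one batch of snapshots, then bump progress."""
    BuildingSnapshot.objects.filter(pk__in=del_ids).delete()
    # step is chunk_size / total ids, so step * 100 summed over all chunks
    # reaches 100; storing the cached value as a percentage is assumed here.
    increment_cache(prog_key, step * 100)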
Example #2
    def destroy(self, request, pk=None):
        """
        Starts a background task to delete an organization and all related data.
        ---
        parameter_strategy: replace
        parameters:
            - name: pk
              type: integer
              description: Organization ID (primary key)
              required: true
              paramType: path
        type:
            status:
                description: success or error
                type: string
                required: true
            progress_key:
                description: ID of background job, for retrieving job progress
                type: string
                required: true
        """
        org_id = pk
        deleting_cache_key = get_prog_key(
            'delete_organization_buildings',
            org_id
        )
        tasks.delete_organization.delay(org_id, deleting_cache_key)
        return JsonResponse({
            'status': 'success',
            'progress': 0,
            'progress_key': deleting_cache_key
        })
Example #3
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    import_file = ImportFile.objects.get(pk=file_pk)

    if import_file.raw_save_done:
        return {'status': 'warning', 'message': 'raw data already saved'}

    if import_file.source_type == "Green Button Raw":
        return _save_raw_green_button_data(file_pk, *args, **kwargs)

    parser = reader.MCMParser(import_file.local_file)
    cache_first_rows(import_file, parser)
    rows = parser.next()
    import_file.num_rows = 0

    prog_key = get_prog_key('save_raw_data', file_pk)

    tasks = []
    for chunk in batch(rows, 100):
        import_file.num_rows += len(chunk)
        tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))

    tasks = add_cache_increment_parameter(tasks)
    import_file.num_columns = parser.num_columns()
    import_file.save()

    if tasks:
        chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
    else:
        finish_raw_save.task(file_pk)

    return {'status': 'success'}
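Several of these tasks rely on a batch() helper to split an iterable into fixed-size chunks before fanning work out to subtasks. A minimal sketch of such a chunker, assuming it simply yields successive lists of at most size items; SEED's actual utility may be implemented differently:

from itertools import islice

def batch(iterable, size):
    """Yield successive lists of at most `size` items from any iterable."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk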
Example #4
def finish_mapping(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.mapping_done = True
    import_file.save()
    finish_import_record(import_file.import_record.pk)
    prog_key = get_prog_key('map_data', file_pk)
    cache.set(prog_key, 100)
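finish_mapping takes (results, file_pk) because it is used as a chord callback: celery hands the header tasks' collected return values to the callback as its first argument, and the [file_pk] passed to .subtask becomes the second. A tiny generic illustration of that wiring, with task names invented for the demo rather than taken from SEED:

from celery import chord, shared_task

@shared_task
def add_one(x):
    return x + 1

@shared_task
def report(results, label):
    """Chord callback: `results` is the list of header tasks' return values."""
    return '%s: %s' % (label, sorted(results))

# With a worker and result backend running, the line below ends up calling
# report([1, 2, 3], 'demo'):
# chord([add_one.subtask((i,)) for i in (0, 1, 2)])(report.subtask(['demo']))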
Example #5
    def destroy(self, request, pk=None):
        """
        Starts a background task to delete an organization and all related data.
        ---
        parameter_strategy: replace
        parameters:
            - name: pk
              type: integer
              description: Organization ID (primary key)
              required: true
              paramType: path
        type:
            status:
                description: success or error
                type: string
                required: true
            progress_key:
                description: ID of background job, for retrieving job progress
                type: string
                required: true
        """
        org_id = pk
        deleting_cache_key = get_prog_key(
            'delete_organization_buildings',
            org_id
        )
        tasks.delete_organization.delay(org_id, deleting_cache_key)
        return JsonResponse({
            'status': 'success',
            'progress': 0,
            'progress_key': deleting_cache_key
        })
Example #6
def _delete_organization_buildings(org_pk, chunk_size=100, *args, **kwargs):
    """Deletes all BuildingSnapshot instances within an organization

    :param org_pk: int, str, the organization pk
    """
    qs = BuildingSnapshot.objects.filter(super_organization=org_pk)
    ids = qs.values_list('id', flat=True)
    deleting_cache_key = get_prog_key(
        'delete_organization_buildings',
        org_pk
    )
    if not ids:
        cache.set(deleting_cache_key, 100)
        return

    # delete the canonical buildings
    can_ids = CanonicalBuilding.objects.filter(
        canonical_snapshot__super_organization=org_pk
    ).values_list('id', flat=True)
    _delete_canonical_buildings.delay(can_ids)

    step = float(chunk_size) / len(ids)
    cache.set(deleting_cache_key, 0)
    tasks = []
    for del_ids in batch(ids, chunk_size):
        # we could also use .s instead of .subtask and not wrap the *args
        tasks.append(
            _delete_organization_buildings_chunk.subtask(
                (del_ids, deleting_cache_key, step, org_pk)
            )
        )
    chord(tasks, interval=15)(finish_delete.subtask([org_pk]))
Example #7
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    import_file = ImportFile.objects.get(pk=file_pk)

    if import_file.raw_save_done:
        return {'status': 'warning', 'message': 'raw data already saved'}

    if import_file.source_type == "Green Button Raw":
        return _save_raw_green_button_data(file_pk, *args, **kwargs)

    parser = reader.MCMParser(import_file.local_file)
    cache_first_rows(import_file, parser)
    rows = parser.next()
    import_file.num_rows = 0

    prog_key = get_prog_key('save_raw_data', file_pk)

    tasks = []
    for chunk in batch(rows, 100):
        import_file.num_rows += len(chunk)
        tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))

    tasks = add_cache_increment_parameter(tasks)
    import_file.num_columns = parser.num_columns()
    import_file.save()

    if tasks:
        chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
    else:
        finish_raw_save.task(file_pk)

    return {'status': 'success'}
Example #8
def get_progress(request):
    """
    Return the progress of the cleansing.
    """

    import_file_id = request.GET.get('import_file_id')
    return get_cache(get_prog_key('get_progress', import_file_id))['progress']
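The newer examples read progress through get_cache, which returns a dict with at least 'status' and 'progress' keys rather than a bare number. A rough sketch of a get_cache/set_cache pair consistent with how they are used in Examples #8, #18 and #20; the actual SEED cache wrappers live elsewhere and may store additional fields:

from django.core.cache import cache

def set_cache(key, status, data):
    """Hypothetical sketch: store a status/progress payload under `key`."""
    if not isinstance(data, dict):
        data = {'progress': data}
    data['status'] = status
    cache.set(key, data)

def get_cache(key, default=0.0):
    """Hypothetical sketch: return the cached payload, or a default-shaped one."""
    return cache.get(key) or {'status': 'unknown', 'progress': default}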
Example #9
def get_progress(request):
    """
    Return the progress of the cleansing.
    """

    import_file_id = request.GET.get("import_file_id")
    return get_cache(get_prog_key("get_progress", import_file_id))["progress"]
Example #10
def finish_mapping(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.mapping_done = True
    import_file.save()
    finish_import_record(import_file.import_record.pk)
    prog_key = get_prog_key('map_data', file_pk)
    cache.set(prog_key, 100)
Example #11
def get_progress(request):
    """
    Return the progress of the cleansing.
    """

    import_file_id = request.GET.get('import_file_id')
    return get_cache(get_prog_key('get_progress', import_file_id))['progress']
Example #12
def _map_data(file_pk, *args, **kwargs):
    """Get all of the raw data and process it using appropriate mapping.
    @lock_and_track returns a progress_key

    :param file_pk: int, the id of the import_file we're working with.

    """
    import_file = ImportFile.objects.get(pk=file_pk)
    # Don't perform this task if it's already been completed.
    if import_file.mapping_done:
        prog_key = get_prog_key('map_data', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'mapping already complete'}

    # If we haven't finished saving, we shouldn't proceed with mapping
    # Re-queue this task.
    if not import_file.raw_save_done:
        map_data.apply_async(args=[file_pk], countdown=60, expires=120)
        return {'status': 'error', 'message': 'waiting for raw data save.'}

    source_type_dict = {
        'Portfolio Raw': PORTFOLIO_RAW,
        'Assessed Raw': ASSESSED_RAW,
        'Green Button Raw': GREEN_BUTTON_RAW,
    }
    source_type = source_type_dict.get(import_file.source_type, ASSESSED_RAW)

    qs = BuildingSnapshot.objects.filter(
        import_file=import_file,
        source_type=source_type,
    ).iterator()

    prog_key = get_prog_key('map_data', file_pk)
    tasks = []
    for chunk in batch(qs, 100):
        serialized_data = [obj.extra_data for obj in chunk]
        tasks.append(map_row_chunk.subtask(
            (serialized_data, file_pk, source_type, prog_key)
        ))

    tasks = add_cache_increment_parameter(tasks)
    if tasks:
        chord(tasks, interval=15)(finish_mapping.subtask([file_pk]))
    else:
        finish_mapping.task(file_pk)

    return {'status': 'success'}
Example #13
def _map_data(file_pk, *args, **kwargs):
    """Get all of the raw data and process it using appropriate mapping.
    @lock_and_track returns a progress_key

    :param file_pk: int, the id of the import_file we're working with.

    """
    import_file = ImportFile.objects.get(pk=file_pk)
    # Don't perform this task if it's already been completed.
    if import_file.mapping_done:
        prog_key = get_prog_key('map_data', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'mapping already complete'}

    # If we haven't finished saving, we shouldn't proceed with mapping
    # Re-queue this task.
    if not import_file.raw_save_done:
        map_data.apply_async(args=[file_pk], countdown=60, expires=120)
        return {'status': 'error', 'message': 'waiting for raw data save.'}

    source_type_dict = {
        'Portfolio Raw': PORTFOLIO_RAW,
        'Assessed Raw': ASSESSED_RAW,
        'Green Button Raw': GREEN_BUTTON_RAW,
    }
    source_type = source_type_dict.get(import_file.source_type, ASSESSED_RAW)

    qs = BuildingSnapshot.objects.filter(
        import_file=import_file,
        source_type=source_type,
    ).iterator()

    prog_key = get_prog_key('map_data', file_pk)
    tasks = []
    for chunk in batch(qs, 100):
        serialized_data = [obj.extra_data for obj in chunk]
        tasks.append(map_row_chunk.subtask(
            (serialized_data, file_pk, source_type, prog_key)
        ))

    tasks = add_cache_increment_parameter(tasks)
    if tasks:
        chord(tasks, interval=15)(finish_mapping.subtask([file_pk]))
    else:
        finish_mapping.task(file_pk)

    return {'status': 'success'}
Example #14
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""

    result = {'status': 'success', 'progress': 100}
    prog_key = get_prog_key('save_raw_data', file_pk)
    try:
        import_file = ImportFile.objects.get(pk=file_pk)

        if import_file.raw_save_done:
            result['status'] = 'warning'
            result['message'] = 'Raw data already saved'
            cache.set(prog_key, result)
            return result

        if import_file.source_type == "Green Button Raw":
            return _save_raw_green_button_data(file_pk, *args, **kwargs)

        parser = reader.MCMParser(import_file.local_file)
        cache_first_rows(import_file, parser)
        rows = parser.next()
        import_file.num_rows = 0

        tasks = []
        for chunk in batch(rows, 100):
            import_file.num_rows += len(chunk)
            tasks.append(
                _save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))

        tasks = add_cache_increment_parameter(tasks)
        import_file.num_columns = parser.num_columns()
        import_file.save()

        if tasks:
            chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
        else:
            finish_raw_save.task(file_pk)

    except StopIteration:
        result['status'] = 'error'
        result['message'] = 'StopIteration Exception'
        result['stacktrace'] = traceback.format_exc()
    except Error as e:
        result['status'] = 'error'
        result['message'] = 'File Content Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()
    except KeyError as e:
        result['status'] = 'error'
        result['message'] = 'Invalid Column Name: "' + e.message + '"'
        result['stacktrace'] = traceback.format_exc()
    except Exception as e:
        result['status'] = 'error'
        result['message'] = 'Unhandled Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()

    cache.set(prog_key, result)
    return result
Example #15
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""

    result = {'status': 'success', 'progress': 100}
    prog_key = get_prog_key('save_raw_data', file_pk)
    try:
        import_file = ImportFile.objects.get(pk=file_pk)

        if import_file.raw_save_done:
            result['status'] = 'warning'
            result['message'] = 'Raw data already saved'
            cache.set(prog_key, result)
            return result

        if import_file.source_type == "Green Button Raw":
            return _save_raw_green_button_data(file_pk, *args, **kwargs)

        parser = reader.MCMParser(import_file.local_file)
        cache_first_rows(import_file, parser)
        rows = parser.next()
        import_file.num_rows = 0

        tasks = []
        for chunk in batch(rows, 100):
            import_file.num_rows += len(chunk)
            tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))

        tasks = add_cache_increment_parameter(tasks)
        import_file.num_columns = parser.num_columns()
        import_file.save()

        if tasks:
            chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
        else:
            finish_raw_save.task(file_pk)

    except StopIteration:
        result['status'] = 'error'
        result['message'] = 'StopIteration Exception'
        result['stacktrace'] = traceback.format_exc()
    except Error as e:
        result['status'] = 'error'
        result['message'] = 'File Content Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()
    except KeyError as e:
        result['status'] = 'error'
        result['message'] = 'Invalid Column Name: "' + e.message + '"'
        result['stacktrace'] = traceback.format_exc()
    except Exception as e:
        result['status'] = 'error'
        result['message'] = 'Unhandled Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()

    cache.set(prog_key, result)
    return result
Example #16
def finish_cleansing(results, file_pk):
    """
    Chord that is called after the cleansing is complete

    :param results:
    :param file_pk: import file primary key
    :return:
    """

    prog_key = get_prog_key("cleanse_data", file_pk)
    cache.set(prog_key, 100)
Example #17
    def __init__(self, func_name, unique_id, init_data=None):
        self.func_name = func_name
        self.unique_id = unique_id
        self.key = get_prog_key(func_name, unique_id)
        self.total = None
        self.increment_by = None

        # Load in the initialized data, some of this may be overloaded based
        # on the contents in the cache
        self.initialize(init_data)

        # read the data from the cache, if there is any
        self.load()
Example #18
    def test_progress(self):
        """When a task finishes, it increments the progress counter properly."""
        increment = expected = 25.0
        key = decorators.get_prog_key('fake_func', self.pk)
        self.assertEqual(float(get_cache(key, 0.0)['progress']), 0.0)

        @decorators.lock_and_track
        def fake_func(import_file_pk):
            increment_cache(key, increment)

        fake_func(self.pk)

        self.assertEqual(float(get_cache(key, 0.0)['progress']), expected)
Example #19
    def test_progress(self):
        """When a task finishes, it increments the progress counter properly."""
        increment = expected = 25.0
        key = decorators.get_prog_key('fake_func', self.pk)
        self.assertEqual(float(cache.get(key, 0.0)), 0.0)

        @decorators.lock_and_track
        def fake_func(import_file_pk):
            decorators.increment_cache(key, increment)

        fake_func(self.pk)

        self.assertEqual(float(cache.get(key, 0.0)), expected)
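This older test stores a bare float under the progress key, so increment_cache only has to read the current value, add the increment, and write it back. A minimal sketch consistent with that behaviour; the newer helper evidently stores a dict instead, as Example #18 suggests:

def increment_cache(key, increment):
    """Hypothetical sketch: add `increment` to the float cached under `key`."""
    current = float(cache.get(key, 0.0))
    cache.set(key, current + increment)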
Example #20
def finish_cleansing(file_pk):
    """
    Chord that is called after the cleansing is complete

    :param file_pk: import file primary key
    :return:
    """

    prog_key = get_prog_key('cleanse_data', file_pk)
    result = {
        'status': 'success',
        'progress': 100,
        'message': 'cleansing complete'
    }
    set_cache(prog_key, result['status'], result)
Example #21
    def test_progress(self):
        """Make sure we retrieve data from cache properly."""
        progress_key = decorators.get_prog_key('fun_func', 23)
        expected = 50.0
        cache.set(progress_key, expected)
        resp = self.client.post(
            reverse_lazy("seed:progress"),
            data=json.dumps({
                'progress_key': progress_key,
            }),
            content_type='application/json'
        )

        self.assertEqual(resp.status_code, 200)
        body = json.loads(resp.content)
        self.assertEqual(body.get('progress', 0), expected)
        self.assertEqual(body.get('progress_key', ''), progress_key)
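The test above posts a progress_key to the seed:progress URL and expects the cached value echoed back. A view shaped roughly like the following would satisfy it; this is a hypothetical sketch, not the actual SEED view, and it assumes the same cache object the test writes to:

import json

from django.core.cache import cache
from django.http import JsonResponse

def progress(request):
    """Hypothetical sketch: return the progress stored under a posted progress_key."""
    progress_key = json.loads(request.body)['progress_key']
    return JsonResponse({
        'progress_key': progress_key,
        'progress': cache.get(progress_key, 0),
    })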
Example #22
def remap_data(import_file_pk):
    """"Delete mapped buildings for current import file, re-map them."""
    import_file = ImportFile.objects.get(pk=import_file_pk)
    # Check to ensure that the building has not already been merged.
    mapping_cache_key = get_prog_key('map_data', import_file.pk)
    if import_file.matching_done or import_file.matching_completion:
        cache.set(mapping_cache_key, 100)
        return {
            'status': 'warning', 'message': 'Mapped buildings already merged'
        }

    _remap_data.delay(import_file_pk)

    # Make sure that our mapping cache progress is reset.
    cache.set(mapping_cache_key, 0)
    # Here we also return the mapping_prog_key so that the front end can
    # follow the progress.
    return {'status': 'success', 'progress_key': mapping_cache_key}
Example #23
def remap_data(import_file_pk):
    """"Delete mapped buildings for current import file, re-map them."""
    import_file = ImportFile.objects.get(pk=import_file_pk)
    # Check to ensure that the building has not already been merged.
    mapping_cache_key = get_prog_key('map_data', import_file.pk)
    if import_file.matching_done or import_file.matching_completion:
        cache.set(mapping_cache_key, 100)
        return {
            'status': 'warning', 'message': 'Mapped buildings already merged'
        }

    _remap_data.delay(import_file_pk)

    # Make sure that our mapping cache progress is reset.
    cache.set(mapping_cache_key, 0)
    # Here we also return the mapping_prog_key so that the front end can
    # follow the progress.
    return {'status': 'success', 'progress_key': mapping_cache_key}
Example #24
    def test_remap_buildings(self):
        """Test good case for resetting mapping."""
        # Make raw BSes, these should stick around.
        for x in range(10):
            test_util.make_fake_snapshot(self.import_file, {}, ASSESSED_RAW)

        # Make "mapped" BSes, these should get removed.
        for x in range(10):
            test_util.make_fake_snapshot(self.import_file, {}, ASSESSED_BS)

        # Set import file like we're done mapping
        self.import_file.mapping_done = True
        self.import_file.mapping_progress = 100
        self.import_file.save()

        # Set cache like we're done mapping.
        cache_key = decorators.get_prog_key('map_data', self.import_file.pk)
        cache.set(cache_key, 100)

        resp = self.client.post(
            reverse_lazy("seed:remap_buildings"),
            data=json.dumps({
                'file_id': self.import_file.pk,
            }),
            content_type='application/json'
        )

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(
            BuildingSnapshot.objects.filter(
                import_file=self.import_file,
                source_type__in=(ASSESSED_BS, PORTFOLIO_BS)
            ).count(),
            0
        )

        self.assertEqual(
            BuildingSnapshot.objects.filter(
                import_file=self.import_file,
            ).count(),
            10
        )

        self.assertEqual(cache.get(cache_key), 0)
Example #25
def match_buildings(file_pk):
    """kicks off system matching, returns progress key"""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.matching_done:
        prog_key = get_prog_key('match_buildings', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'matching already complete'}

    if not import_file.mapping_done:
        # Re-add to the queue, hopefully our mapping will be done by then.
        match_buildings.apply_async(args=[file_pk], countdown=10, expires=20)
        return {
            'status': 'error',
            'message': 'waiting for mapping to complete'
        }

    _match_buildings.delay(file_pk)

    return {'status': 'success'}
Example #26
def match_buildings(file_pk, user_pk):
    """kicks off system matching, returns progress key"""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.matching_done:
        prog_key = get_prog_key('match_buildings', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'matching already complete'}

    if not import_file.mapping_done:
        # Re-add to the queue, hopefully our mapping will be done by then.
        match_buildings.apply_async(
            args=[file_pk, user_pk], countdown=10, expires=20
        )
        return {
            'status': 'error',
            'message': 'waiting for mapping to complete'
        }

    _match_buildings.delay(file_pk, user_pk)

    return {'status': 'success'}
Example #27
def _save_raw_green_button_data(file_pk, *args, **kwargs):
    """
    Pulls identifying information out of the xml data, find_or_creates
    a building_snapshot for the data, parses and stores the timeseries
    meter data and associates it with the building snapshot.
    """

    import_file = ImportFile.objects.get(pk=file_pk)

    import_file.raw_save_done = True
    import_file.save()

    res = xml_importer.import_xml(import_file)

    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)

    if res:
        return {'status': 'success'}

    return {'status': 'error', 'message': 'data failed to import'}
Example #28
def delete_organization_inventory(request):
    """
    Starts a background task to delete all properties & taxlots
    in an org.

    :DELETE: Expects 'org_id' for the organization.

    Returns::

        {
            'status': 'success' or 'error',
            'progress_key': ID of background job, for retrieving job progress
        }
    """
    org_id = request.query_params.get('organization_id', None)
    deleting_cache_key = get_prog_key('delete_organization_inventory', org_id)
    tasks.delete_organization_inventory.delay(org_id, deleting_cache_key)
    return JsonResponse({
        'status': 'success',
        'progress': 0,
        'progress_key': deleting_cache_key
    })
Example #29
    def cleansing_progress(self, request, pk=None):
        """
        Return the progress of the cleansing.
        ---
        type:
            status:
                required: true
                type: string
                description: either success or error
            progress:
                type: integer
                description: status of background cleansing task
        parameter_strategy: replace
        parameters:
            - name: pk
              description: Import file ID
              required: true
              paramType: path
        """

        import_file_id = pk
        prog_key = get_prog_key('get_progress', import_file_id)
        cache = get_cache(prog_key)
        return HttpResponse(cache['progress'])
Example #30
def _save_raw_green_button_data(file_pk, *args, **kwargs):
    """
    Pulls identifying information out of the xml data, find_or_creates
    a building_snapshot for the data, parses and stores the timeseries
    meter data and associates it with the building snapshot.
    """

    import_file = ImportFile.objects.get(pk=file_pk)

    import_file.raw_save_done = True
    import_file.save()

    res = xml_importer.import_xml(import_file)

    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)

    if res:
        return {'status': 'success'}

    return {
        'status': 'error',
        'message': 'data failed to import'
    }
Example #31
    def test_get_prog_key(self):
        """We format our cache key properly."""
        expected = cache.make_key('SEED:fun_func:PROG:34')
        self.assertEqual(decorators.get_prog_key('fun_func', 34), expected)
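From the expected value in this test, get_prog_key evidently namespaces a function name and an object id into a single cache key. A plausible sketch, with the exact formatting inferred from the assertion above (Examples #33 and #35 use a bare make_key instead of cache.make_key, so the real helper may differ in that detail):

def get_prog_key(func_name, unique_id):
    """Return the cache key under which a task's progress is tracked."""
    return cache.make_key('SEED:{}:PROG:{}'.format(func_name, unique_id))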
Example #32
def _match_buildings(file_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]

    unmatched_buildings = find_unmatched_building_values(import_file)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking

    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(results, i, can_rev_idx, unmatched_buildings)

        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
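The ngram index above is built from _stringify(values[1:]), which, according to the comments in Examples #34 and #41, concatenates all of a building's identifying values (everything after the PK) with a space between each item. A plausible sketch of that helper; skipping None values is an assumption:

def _stringify(values):
    """Hypothetical sketch: join a building's identifying values with spaces."""
    return ' '.join(str(value) for value in values if value is not None)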
Example #33
    def test_get_prog_key(self):
        """We format our cache key properly."""
        expected = make_key('SEED:fun_func:PROG:' + str(self.pk))
        self.assertEqual(decorators.get_prog_key('fun_func', self.pk),
                         expected)
Example #34
def _match_buildings(file_pk, user_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    #     assert True
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(users=import_file.import_record.owner)[0]
    test = ''
    unmatched_buildings = find_unmatched_buildings(import_file)

    duplicates = []
    newly_matched_building_pks = []

    #Filter out matches based on ID.
    #if the match is a duplicate of other existing data add it to a list
    #and indicate which existing record it is a duplicate of
    for unmatched in unmatched_buildings:
        try:
            match = handle_id_matches(unmatched, import_file, user_pk)
        except DuplicateDataError as e:
            duplicates.append(unmatched.pk)
            unmatched.duplicate_id = e.id
            unmatched.save()
            continue
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    #here we deal with duplicates
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=duplicates).values_list(*BS_VALUES_LIST)
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return
        # here we are going to normalize the addresses to match on address_1 field, this is not ideal because you could match on two locations with same address_1 but different city
    #     unmatched_normalized_addresses=[]

    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4])
        for unmatched in unmatched_buildings
    ]
    # Here we want all the values not related to the BS id for doing comps.
    # dont do this now
    #     unmatched_ngrams = [
    #         _stringify(list(values)[1:]) for values in unmatched_buildings
    #     ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings])
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0]
        for value in canonical_buildings
    }
    # (SD) This loads up an ngram object with all the canonical buildings. The values are the lists of identifying data for each building
    # (SD) the stringify is given all but the first item in the values list and it concatenates each item with a space in the middle

    # we no longer need to
    #     n = ngram.NGram(
    #         [_stringify(values[1:]) for values in canonical_buildings]
    #     )
    # here we are going to normalize the addresses to match on address_1 field, this is not ideal because you could match on two locations with same address_1 but different city
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]
    # For progress tracking
    # sd we now use the address
    #    num_unmatched = len(unmatched_ngrams) or 1
    num_unmatched = len(unmatched_normalized_addresses) or 1
    # this code below seemed to be unclear when I was debugging so I added the brackets
    increment = (1.0 / num_unmatched) * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    # this section spencer changed to make the exact match
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(results, i, can_rev_idx, unmatched_buildings,
                           user_pk)
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0])
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)

    return {'status': 'success'}
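Per the in-code comments, this variant was changed to match on exact normalized addresses instead of ngram search. A hypothetical sketch of _findMatches consistent with that: return every canonical address equal to the unmatched one, so handle_results can map the strings back to PKs through can_rev_idx. The None guard is an assumption:

def _findMatches(un_m_address, canonical_buildings_addresses):
    """Hypothetical sketch: exact-match one normalized address against the canonical list."""
    return [address for address in canonical_buildings_addresses
            if address is not None and address == un_m_address]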
Example #35
    def test_get_prog_key(self):
        """We format our cache key properly."""
        expected = make_key('SEED:fun_func:PROG:' + str(self.pk))
        self.assertEqual(decorators.get_prog_key('fun_func', self.pk),
                         expected)
Example #36
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)
Example #37
def finish_delete(results, org_pk):
    prog_key = get_prog_key('delete_organization_buildings', org_pk)
    cache.set(prog_key, 100)
Example #38
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)
Example #39
def finish_delete(results, org_pk):
    prog_key = get_prog_key('delete_organization_buildings', org_pk)
    cache.set(prog_key, 100)
Example #40
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, {'status': 'success', 'progress': 100})
Example #41
def _match_buildings(file_pk, user_pk):
    """ngram search against all of the canonical_building snapshots for org."""
#     assert True
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    test = ''
    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return
    #here we are going to normalize the addresses to match on address_1 field, this is not ideal because you could match on two locations with same address_1 but different city
#     unmatched_normalized_addresses=[]
    
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4]) for unmatched in unmatched_buildings
    ]
    # Here we want all the values not related to the BS id for doing comps.
    # dont do this now
#     unmatched_ngrams = [
#         _stringify(list(values)[1:]) for values in unmatched_buildings
#     ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return
    #print value[]
    
    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0] for value in canonical_buildings
    }
    # (SD) This loads up an ngram object with all the canonical buildings. The values are the lists of identifying data for each building
    # (SD) the stringify is given all but the first item in the values list and it concatenates each item with a space in the middle
    
    #we no longer need to
#     n = ngram.NGram(
#         [_stringify(values[1:]) for values in canonical_buildings]
#     )
    #here we are going to normalize the addresses to match on address_1 field, this is not ideal because you could match on two locations with same address_1 but different city
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]
    # For progress tracking
# sd we now use the address
#    num_unmatched = len(unmatched_ngrams) or 1
    num_unmatched = len(unmatched_normalized_addresses) or 1
    #this code below seemed to be unclear when I was debugging so I added the brackets
    increment = (1.0 / num_unmatched) * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    # this section spencer changed to make the exact match
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    
    return {'status': 'success'}
Example #42
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, {'status': 'success', 'progress': 100})
Example #43
def _match_buildings(file_pk, user_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]

    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking

    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}