def get_canonical_items():
    """
    Build the canonical record set consumed by a Dedupe model.

    Every `Facility` contributes one record keyed by its ID. Each confirmed
    `FacilityMatch` then contributes a further record built from its list
    item and keyed by the value of `match_to_extended_facility_id`; when that
    key is already present, the values from the confirmed match replace the
    earlier ones.

    Returns:
    A dictionary. The key is the `Facility` OAR ID (or the extended facility
    ID of a confirmed match). The value is a dictionary of clean field values
    keyed by field name (country, name, address). A "clean" value is one which
    has been passed through the `clean` function.
    """
    canonical = {}

    # `country_code` is exposed under the alias `country` so that the keys of
    # each row line up with the field names of the records.
    facility_rows = Facility.objects.all().extra(
        select={'country': 'country_code'},
    ).values('id', 'country', 'name', 'address')
    for row in facility_rows:
        record_key = str(row['id'])
        canonical[record_key] = {
            field: clean(value)
            for field, value in row.items() if field != 'id'
        }

    confirmed_matches = FacilityMatch.objects.filter(
        status=FacilityMatch.CONFIRMED)
    for match in confirmed_matches:
        record_key = match_to_extended_facility_id(match)
        list_item = match.facility_list_item
        canonical[record_key] = {
            'country': clean(list_item.country_code),
            'name': clean(list_item.name),
            'address': clean(list_item.address),
        }

    return canonical
Пример #2
0
def populate_cleaned_fields(apps, schema_editor):
    """
    Fill `clean_name` and `clean_address` on existing `FacilityListItem` rows.

    The signature follows the shape of a Django `RunPython` callable. Rows
    whose `name` and `address` are both empty strings are skipped; every
    other row is re-saved with the cleaned values, falling back to an empty
    string when `clean` returns a falsy value. Progress is printed after
    every 1000 saved rows.

    Arguments:
    apps -- Registry used to look up the `FacilityListItem` model of the
            `api` app.
    schema_editor -- Not used by this function.
    """
    list_item_model = apps.get_model('api', 'FacilityListItem')
    candidates = list_item_model.objects.exclude(name='', address='')
    for saved, list_item in enumerate(candidates.iterator(), start=1):
        list_item.clean_name = clean(list_item.name) or ''
        list_item.clean_address = clean(list_item.address) or ''
        list_item.save()
        if saved % 1000 == 0:
            print('Filled {0}'.format(saved))
def get_messy_items_for_training(mod_factor=5):
    """
    Sample `FacilityListItem` records that are past the upload stage and not
    in any error state, and shape them for training a Dedupe model.

    Arguments:
    mod_factor -- Sampling step. Only records whose position in the query
                  result is a multiple of this value are kept (the first
                  record is always kept), so a larger value yields a smaller
                  sample.

    Returns:
    A dictionary. The key is the `FacilityListItem` ID. The value is a
    dictionary of clean field values keyed by field name (country, name,
    address). A "clean" value is one which has been passed through the `clean`
    function.
    """
    unusable_status = (
        Q(status=FacilityListItem.UPLOADED)
        | Q(status=FacilityListItem.ERROR)
        | Q(status=FacilityListItem.ERROR_PARSING)
        | Q(status=FacilityListItem.ERROR_GEOCODING)
        | Q(status=FacilityListItem.ERROR_MATCHING)
    )
    rows = FacilityListItem.objects.exclude(unusable_status).extra(
        select={'country': 'country_code'},
    ).values('id', 'country', 'name', 'address')

    # Keep every `mod_factor`-th row, counted from the start of the result.
    sampled = []
    for position, row in enumerate(rows):
        if position % mod_factor != 0:
            continue
        sampled.append(row)

    training_items = {}
    for row in sampled:
        record_key = str(row['id'])
        training_items[record_key] = {
            field: clean(value)
            for field, value in row.items() if field != 'id'
        }
    return training_items
def get_messy_items_from_facility_list(facility_list):
    """
    Collect the list items of a `FacilityList` that have been through
    geocoding (with or without results) and shape them for a Dedupe model.

    Arguments:
    facility_list -- A `FacilityList`. Its items are reached through
                     `facility_list.source`.

    Returns:
    A dictionary. The key is the `FacilityListItem` ID. The value is a
    dictionary of clean field values keyed by field name (country, name,
    address). A "clean" value is one which has been passed through the `clean`
    function.
    """
    geocoding_done = (
        Q(status=FacilityListItem.GEOCODED)
        | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)
    )
    list_items = facility_list.source.facilitylistitem_set
    rows = list_items.filter(geocoding_done).extra(
        select={'country': 'country_code'},
    ).values('id', 'country', 'name', 'address')

    messy_items = {}
    for row in rows:
        record_key = str(row['id'])
        messy_items[record_key] = {
            field: clean(value)
            for field, value in row.items() if field != 'id'
        }
    return messy_items
def exact_match_items(messy, contributor):
    """
    Find already-matched list items whose cleaned name, cleaned address and
    country code are identical to those of each messy item.

    Arguments:
    messy -- A dictionary of messy items keyed by ID. Each value is a
             dictionary that may contain `name`, `address` and `country`
             keys; missing keys are treated as empty strings.
    contributor -- Passed through to `sort_exact_matches`, which is used to
                   order the candidates when an item has more than one exact
                   match.

    Returns:
    A dictionary with the following keys:
    processed_list_item_ids -- The IDs of the messy items that had at least
                               one exact match.
    item_matches -- A dictionary keyed by messy item ID. Each value holds the
                    matching rows (`id`, `facility_id`,
                    `source__contributor_id`, `updated_at`); when there is
                    more than one it is the result of `sort_exact_matches`.
    started -- String form of the UTC time at which processing began.
    finished -- String form of the UTC time at which processing ended.
    """
    started = str(datetime.utcnow())

    matched_items = FacilityListItem.objects \
        .filter(status__in=[FacilityListItem.MATCHED,
                            FacilityListItem.CONFIRMED_MATCH]) \
        .exclude(facility_id=None)
    active_item_ids = FacilityMatch.objects \
        .filter(status__in=[FacilityMatch.AUTOMATIC,
                            FacilityMatch.CONFIRMED,
                            FacilityMatch.MERGED],
                is_active=True,
                facility_list_item__source__is_active=True) \
        .values_list('facility_list_item', flat=True)

    # This condition does not depend on the item being processed, so it is
    # built once instead of on every iteration.
    empty_text_fields = Q(
        Q(clean_name__isnull=True) | Q(clean_name__exact='')
        | Q(clean_address__isnull=True) | Q(clean_address__exact=''))

    results = dict()

    for messy_id, item in messy.items():
        clean_name = clean(item.get('name', ''))
        clean_address = clean(item.get('address', ''))
        country_code = item.get('country', '').upper()

        # Rows with an empty or NULL cleaned name/address are excluded below,
        # so an item whose own cleaned name or address is empty can never
        # have an exact match; skip the database round-trip.
        if not clean_name or not clean_address:
            continue

        exact_matches = matched_items.filter(clean_name=clean_name,
                                             clean_address=clean_address,
                                             country_code=country_code) \
            .exclude(empty_text_fields) \
            .values('id', 'facility_id', 'source__contributor_id',
                    'updated_at')

        # `len` evaluates the queryset once; the rows are cached on it.
        match_count = len(exact_matches)
        if match_count == 0:
            continue
        if match_count > 1:
            exact_matches = sort_exact_matches(exact_matches,
                                               active_item_ids,
                                               contributor)

        results[messy_id] = exact_matches

    finished = str(datetime.utcnow())

    return {
        'processed_list_item_ids': list(results.keys()),
        'item_matches': results,
        'started': started,
        'finished': finished
    }
Пример #6
0
def is_string_match(item, facility):
    """
    Decide whether a list item and a facility are textually identical.

    Country codes are compared as stored. Names and addresses are compared
    after both sides have been passed through `clean`, the same string
    cleaning used by the matcher.

    Arguments:
    item -- A `FacilityListItem` instance being considered as a potential match
            to the specified facility.
    facility -- A `Facility` instance.

    Returns:
    True if the country code, cleaned name and cleaned address all agree,
    otherwise False.
    """
    # Cheapest comparison first; later checks only run if earlier ones pass.
    if item.country_code != facility.country_code:
        return False
    if clean(item.name) != clean(facility.name):
        return False
    return clean(item.address) == clean(facility.address)
def facility_values_to_dedupe_record(facility_dict):
    """
    Wrap one row of facility values as a single-entry Dedupe record mapping.

    Arguments:
    facility_dict -- A dict with id, country, name, and address keys, such as
                     a row created from a `Facility` values query.

    Returns:
    A dictionary with exactly one entry. The key is the string form of the
    id; the value is a dictionary holding the country, name and address after
    each has been passed through the `clean` function.
    """
    record_key = str(facility_dict['id'])
    cleaned_fields = {
        field: clean(facility_dict[field])
        for field in ("country", "name", "address")
    }
    return {record_key: cleaned_fields}
def match_item(country,
               name,
               address,
               id='id',
               automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD,
               gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD,
               recall_weight=MatchDefaults.RECALL_WEIGHT):
    """
    Match a single facility, described by its raw details, against the
    existing facilities by delegating to `match_items`.

    Arguments:
    country -- A valid country name or 2-character ISO code.
    name -- The name of the facility.
    address -- The address of the facility.
    id -- The key under which the facility is submitted; it is also the key
          value in the returned match results.
    automatic_threshold -- A number from 0.0 to 1.0. A match with a confidence
                           score greater than this value will be assigned
                           automatically.
    gazetteer_threshold -- A number from 0.0 to 1.0. A match with a confidence
                           score between this value and the
                           `automatic_threshold` will be considered a match
                           that requires confirmation.
    recall_weight -- Sets the tradeoff between precision and recall. A value of
                     1.0 gives an equal weight to precision and recall.
                     https://en.wikipedia.org/wiki/Precision_and_recall
                     https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html

    Returns:
    See `match_items`.
    """
    record_key = str(id)
    # The details are cleaned here because `match_items` is handed a
    # ready-made record.
    record = {
        "country": clean(country),
        "name": clean(name),
        "address": clean(address),
    }
    return match_items(
        {record_key: record},
        automatic_threshold=automatic_threshold,
        gazetteer_threshold=gazetteer_threshold,
        recall_weight=recall_weight,
    )
Пример #9
0
def process_facility_and_processing_type_claim_values(claim, apps):
    """
    Normalize the facility type and processing types stored on a claim and
    save the claim.

    Facility type: the stored value is reset to None. If the original value
    passes `value_is_valid` and `get_facility_and_processing_type` returns a
    result whose first element is not None, the claim's facility type is set
    to the cleaned third element of that result and a `facility_type` field
    is created through `create_field`.

    Processing types: `claim.facility_production_types` may be a
    '|'-delimited string or an iterable of values. Each valid value is looked
    up the same way; if at least one lookup succeeds, a `processing_type`
    field is created and the claim's value is replaced by the list of cleaned
    fourth elements of the successful results. Otherwise the claim's value is
    left untouched.

    Arguments:
    claim -- The claim object to update; it must provide `facility_type`,
             `facility_production_types` and `save()`.
    apps -- Passed through to `create_field`.
    """
    raw_facility_type = claim.facility_type
    facility_value = {
        'raw_values': [raw_facility_type],
        'matched_values': [],
    }
    claim.facility_type = None
    if value_is_valid(raw_facility_type):
        lookup = get_facility_and_processing_type(raw_facility_type)
        if lookup[0] is not None:
            claim.facility_type = clean(lookup[2])
            facility_value['matched_values'].append(lookup)
            create_field('facility_type', facility_value, claim, apps)

    processing_types = claim.facility_production_types
    if isinstance(processing_types, str):
        # `split` already yields a one-element list when no '|' is present.
        processing_types = processing_types.split('|')

    if processing_types is not None:
        processing_value = {
            'raw_values': list(processing_types),
            'matched_values': [],
        }
        claim_values = []
        for raw_type in processing_types:
            if not value_is_valid(raw_type):
                continue
            lookup = get_facility_and_processing_type(raw_type)
            if lookup[0] is None:
                continue
            processing_value['matched_values'].append(lookup)
            claim_values.append(clean(lookup[3]))

        if processing_value['matched_values']:
            create_field('processing_type', processing_value, claim, apps)
            claim.facility_production_types = claim_values

    claim.save()
Пример #10
0
def parse_facility_list_item(item):
    """
    Parse the raw CSV row stored on a `FacilityListItem` and populate the
    item's fields from it.

    The header is read from `item.source.facility_list.header` and the row
    from `item.raw_data`; both are split with `parse_csv_line`. For each
    recognized header that is present, the corresponding attribute of the
    item is set (country code, name and address with their cleaned forms,
    geocoded point, PPE fields). `create_extendedfields_for_listitem` is then
    called and the item is validated with `full_clean`.

    The outcome is recorded on the item itself:
    - On success the status becomes `PARSED`.
    - On a `ValidationError` the offending attributes are reset to an empty
      string and the status becomes `ERROR_PARSING`.
    - On any other exception raised while parsing, the status becomes
      `ERROR_PARSING`.
    In every case a dictionary describing the attempt is appended to
    `item.processing_results`.

    Arguments:
    item -- A `FacilityListItem` in the `UPLOADED` status.

    Returns:
    Nothing. The item is modified in place.

    Raises:
    ValueError -- If `item` is not exactly of type `FacilityListItem` or is
                  not in the `UPLOADED` status.
    """
    started = str(datetime.utcnow())
    # Exact type comparison: instances of subclasses are rejected as well.
    if type(item) != FacilityListItem:
        raise ValueError('Argument must be a FacilityListItem')
    if item.status != FacilityListItem.UPLOADED:
        raise ValueError('Items to be parsed must be in the UPLOADED status')
    # From here on, failures are recorded on the item instead of propagating
    # to the caller (see the outer `except` at the bottom).
    try:
        is_geocoded = False
        # Header names are lower-cased before being compared with the
        # `CsvHeaderField` constants below.
        fields = [
            f.lower() for f in parse_csv_line(item.source.facility_list.header)
        ]
        values = parse_csv_line(item.raw_data)

        # facility_type_processing_type is a special "meta" field that attempts
        # to simplify the submission process for contributors.
        # When present, its value is copied into a `facility_type` and/or a
        # `processing_type` column unless the row already provides them.
        if 'facility_type_processing_type' in fields:
            if 'facility_type' not in fields:
                fields.append('facility_type')
                values.append(
                    values[fields.index('facility_type_processing_type')])
            if 'processing_type' not in fields:
                fields.append('processing_type')
                values.append(
                    values[fields.index('facility_type_processing_type')])

        if CsvHeaderField.COUNTRY in fields:
            item.country_code = get_country_code(values[fields.index(
                CsvHeaderField.COUNTRY)])
        if CsvHeaderField.NAME in fields:
            item.name = values[fields.index(CsvHeaderField.NAME)]
            item.clean_name = clean(item.name)
            # `clean` may return None; store an empty string instead.
            if item.clean_name is None:
                item.clean_name = ''
        if CsvHeaderField.ADDRESS in fields:
            item.address = values[fields.index(CsvHeaderField.ADDRESS)]
            item.clean_address = clean(item.address)
            # `clean` may return None; store an empty string instead.
            if item.clean_address is None:
                item.clean_address = ''
        # A point is only set when both coordinate columns are present. A
        # non-numeric value makes `float` raise, which is handled by the
        # outer `except`.
        if CsvHeaderField.LAT in fields and CsvHeaderField.LNG in fields:
            lat = float(values[fields.index(CsvHeaderField.LAT)])
            lng = float(values[fields.index(CsvHeaderField.LNG)])
            # NOTE(review): longitude is passed first; presumably `Point`
            # takes (x, y) -- confirm against its definition.
            item.geocoded_point = Point(lng, lat)
            is_geocoded = True

        if CsvHeaderField.PPE_PRODUCT_TYPES in fields:
            product_types = values[fields.index(
                CsvHeaderField.PPE_PRODUCT_TYPES)]
            # The nested list comprehension ensures that we filter out
            # whitespace-only values
            item.ppe_product_types = \
                [s for s in [s.strip() for s in product_types.split('|')] if s]
        if CsvHeaderField.PPE_CONTACT_PHONE in fields:
            item.ppe_contact_phone = values[fields.index(
                CsvHeaderField.PPE_CONTACT_PHONE)]
        if CsvHeaderField.PPE_CONTACT_EMAIL in fields:
            item.ppe_contact_email = values[fields.index(
                CsvHeaderField.PPE_CONTACT_EMAIL)]
        if CsvHeaderField.PPE_WEBSITE in fields:
            item.ppe_website = values[fields.index(CsvHeaderField.PPE_WEBSITE)]

        create_extendedfields_for_listitem(item, fields, values)

        try:
            # Validate the populated item; the listed fields are left out of
            # the validation.
            item.full_clean(exclude=('processing_started_at',
                                     'processing_completed_at',
                                     'processing_results', 'geocoded_point',
                                     'facility'))
            item.status = FacilityListItem.PARSED
            item.processing_results.append({
                'action':
                ProcessingAction.PARSE,
                'started_at':
                started,
                'error':
                False,
                'finished_at':
                str(datetime.utcnow()),
                'is_geocoded':
                is_geocoded,
            })
        except ValidationError as ve:
            # Build one human-readable message per invalid field.
            messages = []
            for name, errors in ve.error_dict.items():
                # We need to clear the invalid value so we can save the row
                setattr(item, name, '')
                # All messages for a field are concatenated without a
                # separator.
                error_str = ''.join(''.join(e.messages) for e in errors)
                messages.append('There is a problem with the {0}: {1}'.format(
                    name, error_str))

            # If there is a validation error on the `ppe_product_types` array
            # field, `full_clean` appears to set it to an empty string which
            # then causes `save` to raise an exception.
            ppe_product_types_is_valid = (item.ppe_product_types is None
                                          or isinstance(
                                              item.ppe_product_types, list))
            if not ppe_product_types_is_valid:
                item.ppe_product_types = []

            item.status = FacilityListItem.ERROR_PARSING
            item.processing_results.append({
                'action':
                ProcessingAction.PARSE,
                'started_at':
                started,
                'error':
                True,
                'message':
                '\n'.join(messages),
                'trace':
                traceback.format_exc(),
                'finished_at':
                str(datetime.utcnow()),
            })
    except Exception as e:
        # Catch-all for anything that went wrong before validation (for
        # example a missing column value or an unparsable coordinate); the
        # error text and traceback are stored on the item.
        item.status = FacilityListItem.ERROR_PARSING
        item.processing_results.append({
            'action': ProcessingAction.PARSE,
            'started_at': started,
            'error': True,
            'message': str(e),
            'trace': traceback.format_exc(),
            'finished_at': str(datetime.utcnow()),
        })