# Module-level imports assumed by the snippets below; util, import_export,
# predictability_checker, standardize_restaurant_data and string_matching
# are project-local modules that are not shown here.
import pprint
import re
from collections import defaultdict

import import_export
import predictability_checker
import standardize_restaurant_data
import string_matching
import util


def clean(file_path=util.restaurants_file,
          collection_lane=util.standard_collection):
    import_export.init_cleaning(file_path, collection_lane)

    print("Average of different entries in other fields for a given entry")
    practicabilities = predictability_checker.check_indications(
        util.field_names,
        util.current_collection(collection_lane).find(
            {}, util.get_fields_projection()))
    pprint.pprint(practicabilities)

    print("Standardizing addresses")
    standardize_restaurant_data.standardize_addresses(collection_lane)

    print("Standardizing cities")
    standardize_restaurant_data.standardize_cities(collection_lane)

    print("Standardizing phone numbers")
    standardize_restaurant_data.standardize_phone_numbers(collection_lane)

    print("Standardizing restaurant types")
    standardize_restaurant_data.standardize_restaurant_types(collection_lane)

    print("Average of different entries in other fields for a given entry")
    practicabilities = predictability_checker.check_indications(
        util.field_names,
        util.current_collection(collection_lane).find(
            {}, util.get_fields_projection()))
    pprint.pprint(practicabilities)
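
# The projection helper used above is not shown; a minimal sketch, assuming
# util.field_names lists the data columns and util.id_pm is MongoDB's "_id"
# key (both names taken from their usage in these snippets):
def get_fields_projection():
    projection = {field: 1 for field in util.field_names}
    projection[util.id_pm] = 0  # exclude the MongoDB object id
    return projection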


# Example 2
def standardize_restaurant_types(collection_lane):
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = list()

    containing_numbers = re.compile(r" \d.*\d ")
    split_points = re.compile(r"(?: and |/)")

    replace_dict = {"bbq": "barbecue"}

    for entry in data:
        type_content = entry[util.type_field]

        result = containing_numbers.search(type_content)
        if result:
            # Drop everything up to and including the embedded digit span.
            type_content = type_content[result.end():]

        type_content = split_points.split(type_content)

        for i, content in enumerate(type_content):
            type_content[i] = replace_dict.get(content, content)

        entry[util.type_field] = type_content
        new_data.append(entry)
    next_collection.insert_many(new_data)
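
# For illustration, the pipeline above turns a type field such as
# "coffee shops 213 555-1234 american and bbq" into
# ["american", "barbecue"]: the digit span is cut off, the remainder is
# split on " and " and "/", and "bbq" is mapped through replace_dict.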


# Example 3
def standardize_cities(collection_lane):
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = list()

    replace_dict = {
        'la': 'los angeles',
        'west la': 'los angeles',
        'w. hollywood': 'west hollywood',
        'new york': 'new york city',
        'st. boyle hts.': 'boyle heights',
    }
    district_dict = util.invert_dictionary_lists(util.districts)

    for entry in data:
        std_city = entry.get(util.city_field)
        if not std_city:
            print("entry with missing '{}' field".format(util.city_field))
            # Pass the unmodified entry through to the next stage.
            next_collection.insert_one(entry)
            continue

        std_city = replace_dict.get(std_city, std_city)
        std_city = district_dict.get(std_city, std_city)

        entry[util.city_field] = std_city
        new_data.append(entry)
    next_collection.insert_many(new_data)
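
# util.invert_dictionary_lists is not shown; a plausible sketch, assuming a
# mapping from key to a list of values (e.g. city -> [district, ...]) that
# is inverted into value -> key:
def invert_dictionary_lists(dictionary):
    return {value: key
            for key, values in dictionary.items()
            for value in values}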


# Example 4
def import_restaurants_data(file_path, collection_lane):
    cur_collection = util.current_collection(collection_lane)
    count = cur_collection.estimated_document_count()
    if count > 0:
        print("Data already imported. Deleting {} entries.".format(count))
        cur_collection.delete_many({})

    dicts = import_tsv(file_path)["data"]

    print("Importing data...")
    cur_collection.insert_many(dicts)
    print("done ({} entries)".format(cur_collection.estimated_document_count()))


# Example 5
def aggregate_street_types():
    projection = {util.id_pm: 0, util.address_field: 1}
    addresses = [entry.get(util.address_field)
                 for entry in util.current_collection().find({}, projection)]
    directions = re.compile(r"( (at|near|between|off|in) )")
    street_type = re.compile(r'\b\S+\.?(?: [a-z]\.?)?$', re.IGNORECASE)

    street_names = []
    for address in addresses:
        if not address:
            continue
        # Cut the address off at the first direction word ("at", "near", ...).
        result = directions.search(address)
        if result:
            address = address[:result.start()]
        # Keep everything up to and including the first period, if any.
        result = re.search(r"\.", address)
        if result:
            address = address[:result.end()]
        address = address.strip()
        # The street type is the last word, optionally followed by a
        # single-letter direction suffix such as "n.".
        result = street_type.search(address)
        if result:
            street_names.append(result.group(0))

    temp_collection = util.get_temp_collection()
    temp_collection.insert_many(
        [{util.address_field: name} for name in street_names])

    aggregates = group_and_count(temp_collection.name, util.address_field)

    for entry in aggregates:
        print(entry)
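
# group_and_count is not shown; a plausible sketch using a MongoDB
# aggregation pipeline (util.get_db is a hypothetical accessor for the
# underlying database):
def group_and_count(collection_name, field):
    return util.get_db()[collection_name].aggregate([
        {"$group": {"_id": "$" + field, "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ])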


# Example 6
def standardize_phone_numbers(collection_lane):
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = list()
    non_number = re.compile(r"\D+")
    non_number_start_end = re.compile(r"(^\D+)|(\D+$)")

    for entry in data:
        phone = entry.get(util.phone_field)

        if not phone:
            print("entry with missing '{}' field".format(util.phone_field))
            # Pass the unmodified entry through to the next stage.
            next_collection.insert_one(entry)
            continue

        # Strip non-digits from both ends, then collapse every interior run
        # of non-digits to a single hyphen.
        phone = re.sub(non_number_start_end, "", phone)
        phone = re.sub(non_number, "-", phone)
        entry[util.phone_field] = phone

        new_data.append(entry)
    next_collection.insert_many(new_data)
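
# For illustration, the normalization above maps, for example:
#   "(213) 555-1234."  ->  "213-555-1234"
#   "213/555-1234 x10" ->  "213-555-1234-10"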


# Example 7
def get_similarity_values(collection_lane):
    tokenized_data = get_tokenized_data(
        list(util.current_collection(collection_lane).find({})))
    num_entries = len(tokenized_data)
    # Sorted-neighborhood blocking: each entry is only compared with the
    # next few entries after sorting, not with every other entry.
    entry_comparisons = 4
    similarity_values = {}
    string_matchers = {}
    measured_fields = [util.phone_field, util.address_field, util.name_field]

    for field in util.field_names:
        string_matchers[field] = string_matching.SoftTfIdf(
            get_corpus_list(tokenized_data, field), threshold=0.9)

    for field in measured_fields:
        # Sort so that entries with similar values in this field end up next
        # to each other; for phone numbers the token order is meaningful, so
        # the tokens are not sorted first.
        if field == util.phone_field:
            tokenized_data.sort(key=lambda x: "".join(x[field]))
        else:
            tokenized_data.sort(key=lambda x: "".join(sorted(x[field])))

        for i in range(num_entries):
            i_id = tokenized_data[i][util.id_field][0]
            if i_id not in similarity_values:
                similarity_values[i_id] = {}

            for j in range(i + 1, min(i + 1 + entry_comparisons, num_entries)):
                j_id = tokenized_data[j][util.id_field][0]
                if j_id not in similarity_values[i_id]:
                    similarity_values[i_id][j_id] = {}

                for field_to_check in measured_fields:
                    if field_to_check not in similarity_values[i_id][j_id]:
                        matcher = string_matchers[field_to_check]
                        similarity_values[i_id][j_id][field_to_check] = \
                            matcher.get_raw_score(
                                tokenized_data[i][field_to_check],
                                tokenized_data[j][field_to_check])
    return similarity_values
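
# The tokenization helpers are not shown; minimal sketches, assuming simple
# lowercase whitespace tokens (SoftTfIdf here looks like py_stringmatching's
# soft TF-IDF measure, which takes a corpus of token lists):
def get_tokenized_data(entries):
    return [{field: str(value).lower().split()
             for field, value in entry.items()}
            for entry in entries]

def get_corpus_list(tokenized_data, field):
    return [entry[field] for entry in tokenized_data]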


# Example 8
def standardize_addresses(collection_lane):
    current_collection = util.current_collection(collection_lane)
    data = [a for a in current_collection.find({})]
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = list()

    # Unexpected street types found while auditing, keyed by street type.
    street_types = defaultdict(set)

    directions = re.compile(r"( (at|near|between|off|in) )")

    written_numbers = {
        "first": "1st",
        "second": "2nd",
        "third": "3rd",
        "fourth": "4th",
        "fifth": "5th",
        "sixth": "6th",
        "seventh": "7th",
        "eighth": "8th",
        "ninth": "9th",
        "tenth": "10th",
        "eleventh": "11th",
        "twelfth": "12th"
    }
    written_numbers_re = re.compile(r"(?P<num>{})".format("|".join(
        written_numbers.keys())))

    abbreviations = "|".join(
        util.invert_dictionary_lists(util.street_suffix_abbreviations))
    abbr_replacement = re.compile(
        r" (?P<abbr>{})\.?( |$)".format(abbreviations))
    abbr_lookup = util.invert_dictionary_lists(
        util.street_suffix_abbreviations)

    double_space = re.compile("  ")

    for entry in data:
        address = entry.get(util.address_field)
        original = address

        if not address:
            print("entry with missing '{}' field".format(util.address_field))
            # Pass the unmodified entry through to the next stage.
            next_collection.insert_one(entry)
            continue

        result = directions.search(address)
        if result:
            address = address[:result.start()]

        # Expand street-suffix abbreviations (e.g. " st. " -> " street ");
        # group 2 is the trailing space or end of string.
        address = abbr_replacement.sub(
            lambda m: " {}{}".format(abbr_lookup[m.group('abbr')], m.group(2)),
            address)

        # Replace spelled-out street numbers ("first" -> "1st").
        address = written_numbers_re.sub(
            lambda m: written_numbers[m.group('num')], address)

        # Collapse double spaces left over from the edits above.
        address = double_space.sub(" ", address)

        address = address.strip()

        audit_street_type(street_types, address, original)

        entry[util.address_field] = address
        new_data.append(entry)

    next_collection.insert_many(new_data)

    not_expected_count = sum(len(entries) for entries in street_types.values())
    total_count = len(data)
    ratio_not_expected = not_expected_count / total_count * 100
    ratio_expected = 100 - ratio_not_expected
    print("Not expected:       {}/{}".format(not_expected_count, total_count))
    print("Ratio not expected: {:5.1f}%".format(ratio_not_expected))
    print("Ratio expected:     {:5.1f}%".format(ratio_expected))
    return street_types
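
# audit_street_type is not shown; a plausible sketch, assuming a hypothetical
# util.expected_street_types list of known street suffixes:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

def audit_street_type(street_types, address, original):
    result = street_type_re.search(address)
    if result and result.group(0) not in util.expected_street_types:
        street_types[result.group(0)].add(original)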