Example #1
def clean_raw_locations_from_file(inputfilename, outputfilename):
    #Clean every raw location in the input file and write the results to the output file
    inputfile = open(inputfilename, 'r')
    outputfile = open(outputfilename, 'w+')
    for line in inputfile:
        #Python 2: lines come back as bytes, so decode before cleaning
        line = line.decode('utf8')
        line = geoalchemy_util.clean_raw_location(line)
        #Re-encode the cleaned unicode text before writing it back out
        line = line.encode('utf8')
        outputfile.write(line)
    inputfile.close()
    outputfile.close()
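These examples are Python 2, which is why each line read from the file is decoded from UTF-8 and re-encoded before writing. The same loop can be written with codecs.open so the decoding and encoding happen implicitly; a minimal sketch, assuming geoalchemy_util.clean_raw_location takes and returns unicode text:

import codecs
import geoalchemy_util  #project utility module, assumed importable here

def clean_raw_locations_from_file(inputfilename, outputfilename):
    #codecs.open yields and accepts unicode directly, so no manual decode/encode is needed
    inputfile = codecs.open(inputfilename, 'r', encoding='utf8')
    outputfile = codecs.open(outputfilename, 'w', encoding='utf8')
    for line in inputfile:
        outputfile.write(geoalchemy_util.clean_raw_location(line))
    inputfile.close()
    outputfile.close()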
Example #2
def clean_raw_locations_from_file(inputfilename, outputfilename):
    inputfile = open(inputfilename, "r")
    outputfile = open(outputfilename, "w+")
    for line in inputfile:
        line = line.decode("utf8")
        line = geoalchemy_util.clean_raw_location(line)
        line = line.encode("utf8")
        outputfile.write(line)
Example #3
def analyze_input_addresses(inputfilename):
    construct_valid_input_address_list()
    print datetime.datetime.now()
    inputfile = open(inputfilename, 'r')
    line_count = 0
    good_count = 0
    exists_in_all_cities_count = 0
    #not_found_file = open('not_found.txt', 'w+')
    for line in inputfile:
        line = line.decode('utf8')
        input_address = geoalchemy_util.clean_raw_location(line)
        if input_address_exists(input_address):
            good_count += 1
        #else:
        #not_found_file.write('{0}\n'.format(input_address.encode('utf8')))
        line_count += 1
    print 'All lines compared!'
    print '% good:', good_count * 1.0 / line_count
    print '% in all_cities:', exists_in_all_cities_count * 1.0 / line_count
    print datetime.datetime.now()
Example #4
def analyze_input_addresses(inputfilename):
    valid_input_addresses = construct_valid_input_addresses()
    print datetime.datetime.now()
    inputfile = open(inputfilename, 'r')
    line_count = 0
    good_count = 0
    exists_in_all_cities_count = 0
    #not_found_file = open('not_found.txt', 'w+')
    for line in inputfile:
        line = line.decode('utf8')
        input_address = geoalchemy_util.clean_raw_location(line)
        if input_address_exists(valid_input_addresses, input_address):
            good_count += 1
        #else:
            #not_found_file.write('{0}\n'.format(input_address.encode('utf8')))
        line_count += 1
    print 'All lines compared!'
    print '% good:', good_count * 1.0 / line_count
    print '% in all_cities:', exists_in_all_cities_count * 1.0 / line_count
    print datetime.datetime.now()
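Neither construct_valid_input_addresses nor input_address_exists is shown in these examples. Judging only by how they are called, a set-based pair along the following lines would fit the call sites (the geo_data_session and RawGoogle names are the ones used in the main() examples below; the project's real implementations may differ):

def construct_valid_input_addresses():
    #Hypothetical sketch: collect every input_address that Google resolved
    #into a set once, so membership tests in the main loop are O(1)
    return set(row.input_address for row in
               geo_data_session.query(RawGoogle.input_address))

def input_address_exists(valid_input_addresses, input_address):
    #Hypothetical sketch: simple membership test against the precomputed set
    return input_address in valid_input_addresses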
Example #5
def main(limit=None, offset=0):
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    #Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(
        alchemy.RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(
            instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(
            parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(cleaned_location, force_lowercase=True):
            #Find the location from the raw_google database that matches this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                sqlalchemy.func.lower(RawGoogle.input_address) ==
                sqlalchemy.func.lower(cleaned_location)).first()
            grouping_id = u"{0}|{1}".format(matching_location.latitude,
                                            matching_location.longitude)
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            print cleaned_location
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '',
                                          -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({
            "raw_location": instance,
            "matching_location": matching_location,
            "grouping_id": grouping_id
        })
    print "grouped_locations created", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    grouped_locations_enum = enumerate(
        itertools.groupby(grouped_locations, keyfunc))
    print "grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(grouped_locations_enum, t)

    alchemy_session.commit()

    print "Matches made!", datetime.datetime.now() - t
    unique_group_count = alchemy_session.query(
        expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count,
                                                  raw_parsed_locations.count())
Example #6
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", doctype, t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    raw_parsed_locations_count = raw_parsed_locations.count()

    #If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations_count, 'items'
    """
    identified_grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                                     RawGoogle.input_address==cleaned_location).first()
            if matching_location:
                grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
                identified_grouped_locations.append({"raw_location": instance,
                                      "matching_location": matching_location,
                                      "grouping_id": grouping_id})
            else:
                print 'Cleaned location not matched', cleaned_location
                country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
                unidentified_grouped_locations.append({"raw_location": instance,
                                                       "cleaned_location": cleaned_location,
                                                       "country": country})

        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Sort the locations by their country
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
        if ((len(identified_grouped_locations)+len(unidentified_grouped_locations))%10000 == 0):
            print "Processed", len(identified_grouped_locations)+len(unidentified_grouped_locations), datetime.datetime.now()
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    alchemy_session.close()


    #We now have two lists of locations. First, consider the unmatched locations.
    keyfunc = lambda x: x["country"]
    #Sort the list by the country
    unidentified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #country
    unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc))
    #Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print 'new count of identified locations:', len(identified_grouped_locations)

    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations)

    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc))
    print "identified_grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()

    alchemy_session = alchemy.fetch_session(dbtype=doctype)

    #Match the locations
    match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)

    print "Matches made!", datetime.datetime.now() - t
    if doctype == 'grant':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all()
    elif doctype == 'application':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all()

    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count)
    alchemy_session.close()
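Every main() variant batches locations the same way: sort the list of dicts by a key, then wrap itertools.groupby in enumerate so each group arrives with an index. A small self-contained illustration of that pattern:

import itertools

#Toy records standing in for the grouped_locations dicts built in the examples
records = [
    {"grouping_id": u"42.36|-71.06", "city": u"Boston"},
    {"grouping_id": u"nolocationfound", "city": u""},
    {"grouping_id": u"42.36|-71.06", "city": u"Cambridge"},
]
keyfunc = lambda x: x["grouping_id"]
#groupby only merges adjacent items, so the list must be sorted by the same key first
records.sort(key=keyfunc)
for index, (grouping_id, group) in enumerate(itertools.groupby(records, keyfunc)):
    print index, grouping_id, [record["city"] for record in group]

Here the two records sharing a latitude/longitude end up in one group and the unmatched record in another, which mirrors the input that match_grouped_locations receives in the examples above.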
Example #7
def main(limit=None, offset=0):
    t = datetime.datetime.now()
    print "geocoding started", t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    construct_valid_input_address_list(force_lowercase=True)
    #Get all of the raw locations in alchemy.db that were parsed from XML
    raw_parsed_locations = alchemy_session.query(alchemy.RawLocation).limit(limit).offset(offset)
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations.count() == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations.count(), 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the latitude and longitude of the matching_location
    """
    grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(cleaned_location, force_lowercase=True):
            #Find the location from the raw_google database that matches this input
            matching_location = geo_data_session.query(RawGoogle).filter(
                                    sqlalchemy.func.lower(RawGoogle.input_address)==
                                    sqlalchemy.func.lower(cleaned_location)).first()
            grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            print cleaned_location
            matching_location = RawGoogle(cleaned_location, '', '', '', '', '', -1)
            grouping_id = u"nolocationfound"
        grouped_locations.append({"raw_location": instance,
                                  "matching_location": matching_location,
                                  "grouping_id": grouping_id})
    print "grouped_locations created", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    grouped_locations_enum = enumerate(itertools.groupby(grouped_locations, keyfunc))
    print "grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()
    #Match the locations
    match_grouped_locations(grouped_locations_enum, t)
    
    alchemy_session.commit()

    print "Matches made!", datetime.datetime.now() - t
    unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.Location.id))).all()
    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations.count())