Пример #1
0
def method_new(name="Untitled Q-Method", owner="Your Name", email="email", phone='phone', notes=''):
    """
    Create a new Q-Method with a default configuration.

    Ids are requested from generate_id() until one is non-empty and is not
    already listed in data_path.  A directory named after that id is created
    inside data_path and a "config.json" is written into it, holding the
    supplied name, owner, email, phone and notes plus sample statements and
    a default column layout.  Finally redirect() is called with the new
    method's bookmark URL.

    Raises ValueError if the directory for the chosen id already exists.
    """
    #look for existing methods:
    options = os.listdir(data_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    method_path = os.path.join(data_path, new_option)
    if os.path.exists(method_path):
        #This should never happen with above while loop, but just in case...
        #(the call form of raise is valid on both Python 2 and Python 3)
        raise ValueError("Path exists, but it shouldn't: %s" % method_path)
    os.makedirs(method_path)

    #make an empty configuration file and fill in the defaults
    config = os.path.join(method_path, "config.json")
    result = load_json(config, create=True)
    result['name'] = name
    result['owner'] = owner
    result['email'] = email
    result['phone'] = phone
    result['notes'] = notes
    result['statements'] = "1. First sample statement\n2. Second sample statement"
    result['columns'] = '2 3 5 6 8 6 5 3 2'

    save_json(config, result)

    #redirect to the new method's page:
    redirect("/method/" + new_option + "/bookmark/")
Пример #2
0
def skf_cross_validate(model, X, y):
    """
    wrapper function to do sklearn style stratified k fold cross validation
    on a keras model. Some code borrowed from:
    https://medium.com/@literallywords/stratified-k-fold-with-keras-e57c487b1416

    input - model, keras model to train; its weights are put back to their
                   starting values at the beginning of every fold
            X, training data
            y, training data labels

    For each of the 5 folds the fit history is passed to training_eval(),
    and the model and its weights are saved under 'lenet-cv-<fold index>'.
    """
    print("Stratified K Fold cross Validating")

    # Remember the starting weights so that every fold trains from the same
    # point. Without this, the model keeps what it learned in earlier folds
    # and is then validated on samples it has already been trained on.
    # NOTE(review): optimizer state is not reset here - confirm whether that
    # matters for the optimizer in use.
    initial_weights = model.get_weights()

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X, y)):
        fold_name = 'lenet-cv-' + str(index)
        model.set_weights(initial_weights)

        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Convert class labels to categorical data/one-hot encoding
        y_test = to_categorical(y_test)
        y_train = to_categorical(y_train)

        print('Training ' + fold_name)
        # Train model and validate
        results = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)
        training_eval(results, fold_name)

        save_json(model, fold_name)
        model.save_weights('models/' + fold_name + '_weights.h5')
def update_json(source, city_tag):
    """
    Convert cached location results from the old dict format to lists.

    The cache file "<city_tag>.json" is loaded from the directory that
    contains `source`.  Every entry under 'buildings' is turned into a
    Location; any attribute named in location.sources whose value is a
    single dict is wrapped in a one-element list.  The converted locations
    are then written back to the same cache file.

    The cache must already contain 'buildings' and 'parcels' keys
    (checked with assert).
    """
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)

    local_cache = load_json(cache_destination, create=True)

    #`in` works on both Python 2 and 3 (dict.has_key is Python 2 only)
    assert 'buildings' in local_cache
    assert 'parcels' in local_cache

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        #separate name so the `source` argument is not overwritten
        for geo_source in location.sources:
            if hasattr(location, geo_source):
                result = getattr(location, geo_source)
                #convert from old dict format here
                if isinstance(result, dict):
                    print("Found dictionary in: %s for: %s" % (geo_source, location.address))

                    result = [ result ]
                    setattr(location, geo_source, result)

        locations[key] = location

    #write the converted locations back to the cache file
    local_cache['buildings'] = {}
    for key, value in locations.items():
        local_cache['buildings'][key] = value.to_dict()
    save_json(cache_destination, local_cache)
Пример #4
0
def update_json(source, city_tag):
    """
    Rewrite the city's cache so every per-source result is stored as a list.

    Loads "<city_tag>.json" from the directory of `source`, builds a
    Location for each cached building, wraps any dict-valued attribute
    listed in location.sources in a list, and saves the result back to the
    same file.  Asserts that the cache has 'buildings' and 'parcels' keys.
    """
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)

    local_cache = load_json(cache_destination, create=True)

    #membership test instead of the Python-2-only dict.has_key()
    assert 'buildings' in local_cache
    assert 'parcels' in local_cache

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        #named geo_source so the `source` parameter keeps its value
        for geo_source in location.sources:
            if not hasattr(location, geo_source):
                continue
            result = getattr(location, geo_source)
            #convert from old dict format here
            if isinstance(result, dict):
                print("Found dictionary in: %s for: %s" % (
                    geo_source, location.address))
                setattr(location, geo_source, [result])

        locations[key] = location

    #store the converted locations and save the cache
    local_cache['buildings'] = {}
    for key, location in locations.items():
        local_cache['buildings'][key] = location.to_dict()
    save_json(cache_destination, local_cache)
Пример #5
0
def subject_new(key):
    """
    create a new subject for the Q-Method specified by key

    Returns the '404' template when no directory for `key` exists in
    data_path.  Otherwise an unused id is generated, a directory with that
    name is created inside the method's directory, a
    "subject_config.json" with empty defaults and a 'created' timestamp is
    saved there, and redirect() is called with the method's URL.

    Raises ValueError if the directory for the chosen id already exists.
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")

    #look for existing subjects:
    options = os.listdir(method_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    subject_path = os.path.join(method_path, new_option)
    if os.path.exists(subject_path):
        #This should not ever happen with above check, but just in case...
        #(the call form of raise is valid on both Python 2 and Python 3)
        raise ValueError("Subject path exists, but it shouldn't: %s" % subject_path)
    os.makedirs(subject_path)

    #make an empty configuration file
    config = os.path.join(subject_path, "subject_config.json")
    result = load_json(config, create=True)
    #once the subject starts sorting, we will cache this locally
    #based on the current state of the method configuration
    result['columns'] = u""
    result['json'] = u""
    #filled in after first movement
    result['started'] = u""
    #a textual representation of where each statement is
    result['state'] = u""
    result['history'] = u""
    #is it finished? complete? this will prevent further changes:
    result['locked'] = False
    result['created'] = datetime.now().strftime("%Y.%m.%d %H:%M:%S")
    result['last_update'] = u""

    save_json(config, result)

    #redirect to the method's page:
    redirect("/method/" + key + "/")
Пример #6
0
def post_subject_json(mkey=None, skey=None):
    """
    Store the state posted for subject `skey` of method `mkey`.

    Returns the '404' template if the method directory or the subject
    directory does not exist.  Otherwise the subject's configuration is
    updated from the submitted form: 'columns' is copied from the method
    configuration when still empty, 'json' is replaced by the posted
    'json' field, 'started' is set on the first update, 'last_update' is
    refreshed, and a "<timestamp>,<action>" line is appended to 'history'.
    The configuration is saved and the 'success' template returned.
    """
    method_path = os.path.join(data_path, mkey)
    if not os.path.exists(method_path):
        return template('404', key=mkey, item="method")

    subject_path = os.path.join(method_path, skey)
    if not os.path.exists(subject_path):
        return template('404', key=skey, item="subject")

    #read both configuration files; `with` closes the handles again
    #(they were previously left open)
    method_config = os.path.join(method_path, "config.json")
    with codecs.open(method_config, 'r', encoding='utf-8', errors='ignore') as method_json_file:
        method_details = json.loads(method_json_file.read())

    subject_config = os.path.join(subject_path, "subject_config.json")
    with codecs.open(subject_config, 'r', encoding='utf-8', errors='ignore') as subject_json_file:
        subject_details = json.loads(subject_json_file.read())

    #one timestamp string is shared by every field written below
    timestamp = datetime.now().strftime("%Y.%m.%d %H:%M:%S")

    if not subject_details['columns']:
        subject_details['columns'] = method_details['columns']

    #this includes the whole tree from javascript:
    #  available, placed, and soon responses
    subject_details['json'] = request.forms.get('json')

    if not subject_details['started']:
        subject_details['started'] = timestamp

    subject_details['last_update'] = timestamp

    #decoding the action explicitly avoids:
    # UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 99: ordinal not in range(128)
    subject_details['history'] += unicode(timestamp) + u"," + request.forms.get('action').decode("utf-8") + u"\n"

    save_json(subject_config, subject_details)

    return template('success')
def save_results(cache_destination, local_cache):
    """
    Save local_cache to cache_destination in a json serializable form.

    Each entry is shallow-copied and the copy's 'results' value is replaced
    by the output of its to_dict() method; the original entries are left
    as they are.
    """
    serializable = {}
    for name, entry in local_cache.items():
        plain_results = entry['results'].to_dict()
        clone = copy.copy(entry)
        clone['results'] = plain_results
        serializable[name] = clone

    save_json(cache_destination, serializable)
Пример #8
0
    def extract(self):
        """Run every extractor for each valid game type and save the key-sorted result."""
        data = OrderedDict()
        extractors = (
            self._extract_radio_calls,
            self._extract_throwables,
            self._extract_weapons,
            self._extract_equipment,
        )

        for game_type in VALID_GAME_TYPES:
            click.echo(game_type)

            # The extractors fill this mapping in place
            unlockables = OrderedDict()
            data[game_type] = unlockables

            for run_extractor in extractors:
                run_extractor(unlockables, game_type)

            # Replace the collected mapping by a copy ordered by key
            ordered_items = sorted(unlockables.items(), key=lambda pair: pair[0])
            data[game_type] = OrderedDict(ordered_items)

        helpers.save_json(app.config['UNLOCKABLES_DATA_FILE'], data)
Пример #9
0
    def extract(self):
        """Collect name, minimap and preview availability of every valid map and save them."""
        # Maps in RWR game directory
        game_maps = glob(os.path.join(self.packages_dir, '*', 'maps', '*', 'objects.svg'))
        # Maps in RWR workshop directory
        workshop_maps = glob(os.path.join(self.workshop_dir, '*', 'media', 'packages', '*', 'maps', '*', 'objects.svg'))

        data = OrderedDict()

        for map_path in game_maps + workshop_maps:
            map_dir = map_path.replace('\\', '/').replace('/objects.svg', '')
            server_type, map_id = utils.parse_map_path(map_dir)

            if not map_id or map_id in INVALID_MAPS or server_type in INVALID_GAME_TYPES:
                click.secho('Invalid map ID ({}) or server type ({})'.format(map_id, server_type), fg='yellow')
                continue

            # The general map information is stored in the description of the "#general" rect
            raw_infos = etree.parse(map_path).findtext(
                '//svg:rect[@inkscape:label=\'#general\']/svg:desc',
                namespaces={'svg': 'http://www.w3.org/2000/svg', 'inkscape': 'http://www.inkscape.org/namespaces/inkscape'}
            )

            if not raw_infos:
                click.secho('No general map info found', fg='yellow')
                continue

            map_infos = self._parse_map_data(raw_infos)

            if 'name' not in map_infos:
                click.secho('Map name not found', fg='yellow')
                continue

            click.echo(server_type + ':' + map_id)

            maps_of_type = data.setdefault(server_type, OrderedDict())

            image_name = map_id + '.png'
            display_name = map_infos['name'].replace('Pacific: ', '').title()
            has_minimap = os.path.isfile(os.path.join(app.config['MINIMAPS_IMAGES_DIR'], server_type, image_name))
            has_preview = os.path.isfile(os.path.join(app.config['MAPS_PREVIEW_IMAGES_DIR'], server_type, image_name))

            maps_of_type[map_id] = OrderedDict([
                ('name', display_name),
                ('has_minimap', has_minimap),
                ('has_preview', has_preview)
            ])

        helpers.save_json(app.config['MAPS_DATA_FILE'], data)
Пример #10
0
def post_method_json(key=None):
    """
    Update the configuration of the method `key` from the submitted form.

    Returns the '404' template if the method directory does not exist.
    Otherwise every submitted field is compared with the stored value and
    replaced when it differs; the configuration is saved only if at least
    one field changed.  Returns the 'success' template.
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")

    config = os.path.join(method_path, "config.json")
    result = load_json(config)

    changed = False
    for field in request.forms.keys():
        value = request.forms.get(field)

        #special case for 'statements':
        #drop empty lines so the number of statements can be calculated
        #accurately here (rather than stripping newlines everywhere we
        #look at statements)
        #
        #this makes it difficult to give the user feedback about how
        #many statements there are compared to the spaces available in
        #columns, so a similar check is added in method.js
        if field == "statements":
            value = '\n'.join([line for line in value.splitlines() if line])

        if value != result[field]:
            result[field] = value
            changed = True

    #only touch the file when something is different
    if changed:
        save_json(config, result)

    return template('success')
Пример #11
0
    def extract(self):
        """Read the official ranks from the faction files, extract their images and save the data."""
        # In Vanilla, ranks from all factions are the same, inspired from the US Army
        vanilla_ranks = {
            'country': 'us',
            'path': os.path.join(self.packages_dir, 'vanilla', 'factions', 'brown.xml'),
            'game_type': 'vanilla'
        }
        # In Pacific, US factions are the same as the Vanilla ones, so only parse IJA ranks
        pacific_ranks = {
            'country': 'jp',
            'path': os.path.join(self.packages_dir, 'pacific', 'factions', 'ija.xml'),
            'game_type': 'pacific'
        }

        data = OrderedDict()

        # Only handle official ranks
        for ranks_file in (vanilla_ranks, pacific_ranks):
            country = ranks_file['country']

            click.echo(country)

            country_ranks = OrderedDict()
            data[country] = country_ranks

            faction_root = etree.parse(ranks_file['path']).getroot()

            # Ranks are keyed by their position in the faction file
            for rank_id, rank_node in enumerate(faction_root.iterchildren('rank')):
                rank_name = rank_node.get('name')

                click.echo(rank_name)

                required_xp = int(float(rank_node.get('xp')) * 10000)
                country_ranks[rank_id] = OrderedDict([
                    ('name', rank_name),
                    ('xp', required_xp)
                ])

                icon_filename = rank_node.find('hud_icon').get('filename')
                self._extract_images(rank_id, ranks_file['game_type'], country, icon_filename)

        helpers.save_json(app.config['RANKS_DATA_FILE'], data)
Пример #12
0
def read_csv(source):
    """
    Read the permit CSV at `source` and geocode the active rental addresses.

    The "ann_arbor" City is looked up (and created when missing).  A json
    cache named "<city.tag>.json" next to `source` holds the locations
    processed so far, so remote geolocation APIs do not have to be called
    again for them.  For every row whose permit type is RENTAL and whose
    status is neither EXPIRED nor CLOSED, each source of the location is
    looked up through Geo.  The cache is saved after every row.

    Raises ValueError for an unexpected permit type or city value in a row.
    """
    city_options = City.objects.filter(tag="ann_arbor")
    print(len(city_options))
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        city.save()
    else:
        city = city_options[0]

    print(city)

    #TODO:
    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    #plain substrings that mark an applicant as a management company
    #(the previous regex 'L.L.C.' let the dots match any character)
    manager_markers = ('MGMT', 'REALTY', 'PROPERTIES', 'MANAGEMENT',
                       'GROUP', 'LLC', 'L.L.C.', 'INC')

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            #could exit out early here, if needed
            if count > 10:
                pass

            print(row)

            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if permit_type not in ("RENTAL", "MECHANICAL"):
                raise ValueError("Unexpected permit type: %s in row: %s" % (
                    permit_type, row))

            #type of building (eg: sf attached, duplex, etc)
            sub_type = row[2]

            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]
            parcel_id = row[4]
            address = row[5]

            #should be fixed per source
            #(own name, so the City object in `city` is not overwritten)
            row_city = row[6]
            if not ((row_city.lower() == 'ann arbor') or (row_city == '')):
                raise ValueError("Unexpected city: %s" % (row_city))

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]

            if (status not in ['EXPIRED', 'CLOSED']) and (permit_type == 'RENTAL'):
                #check if we've started processing any results for this row
                address_key = address.upper()
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address_key

                #own name, so the `source` argument is not overwritten
                for geo_source in location.sources:
                    geo.lookup(search, geo_source, location)

                location.address_alt = search

                locations[address_key] = location

                #and check if a previous building object in the db exists
                #CREATE A NEW BUILDING OBJECT HERE
                bldg = Building()
                bldg.type = sub_type

            #back it up for later
            local_cache['buildings'] = {}
            for key, value in locations.items():
                local_cache['buildings'][key] = value.to_dict()

            save_json(cache_destination, local_cache)

            #THE FOLLOWING ARE FOR INFORMATIONAL PURPOSES ONLY
            #(to see what data is available)

            if status not in status_types:
                status_types.append(status)

            if sub_type not in permit_sub_types:
                permit_sub_types.append(sub_type)

            building_num = row[8]
            if building_num not in building_nums:
                building_nums.append(building_num)

            applicant = row[9]
            if any(marker in applicant for marker in manager_markers):
                if applicant not in managers:
                    managers.append(applicant)
            else:
                if applicant not in applicants:
                    applicants.append(applicant)

    ## print permit_sub_types
    print(status_types)
    print(building_nums)

    save_results(locations)
def read_csv(source_csv, city_name, city_tag):
    """
    Import the buildings listed in source_csv for the city tagged city_tag.

    The City must already exist.  A FeedInfo (dated "2013-07-31"), a
    "Blank" Person and a Source are fetched or created.  A json cache named
    "<city.tag>.json" next to source_csv holds the locations processed so
    far.  For every row the address is rebuilt with the number in front,
    optionally replaced through `conversions`, geocoded with the "google"
    and "bing" sources, and passed to make_building(); the invoice note, if
    any, is passed to make_person().  The cache is saved whenever a lookup
    reported an update.  At the end the locations are handed to
    save_results() with the destination "<city_tag>.tsv".

    Raises ValueError if no City with city_tag exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)

            address = row[0]

            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            street = ",".join(parts[:-1])
            address = "%s %s" % (anumber, street)

            bldg_id = row[1]
            print(bldg_id)

            #this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            if invoice_note.startswith('Sent to:'):
                print("changing invoice note from: %s" % invoice_note)
                invoice_note = invoice_note[len('Sent to:'):]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            #check if this is one we want to skip
            #(a conversion to an empty address marks the row as skipped)
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                #stop here: falling through used to process the skipped row
                #with the previous row's location (or none at all)
                continue

            address_key = address.upper()

            #check if we've started processing any results for this row
            if address_key in locations:
                location = locations[address_key]
            else:
                location = Location()

            #temporarily just want to look at google and bing again
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            #restore the complete list of sources after the lookups
            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[address_key] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

            if invoice_note:
                make_person(invoice_note, bldg, "Permit Applicant")

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source):
    """
    Read the permit CSV at `source` and geocode the active rental addresses.

    The "ann_arbor" City is looked up (and created when missing).  A json
    cache named "<city.tag>.json" next to `source` holds the locations
    processed so far, so remote geolocation APIs do not have to be called
    again for them.  For every row whose permit type is RENTAL and whose
    status is neither EXPIRED nor CLOSED, each source of the location is
    looked up through Geo.  The cache is saved after every row.

    Raises ValueError for an unexpected permit type or city value in a row.
    """
    city_options = City.objects.filter(tag="ann_arbor")
    print(len(city_options))
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        city.save()
    else:
        city = city_options[0]

    print(city)

    #TODO:
    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    #plain substrings that mark an applicant as a management company
    #(the previous regex 'L.L.C.' let the dots match any character)
    manager_markers = ('MGMT', 'REALTY', 'PROPERTIES', 'MANAGEMENT',
                       'GROUP', 'LLC', 'L.L.C.', 'INC')

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            #could exit out early here, if needed
            if count > 10:
                pass

            print(row)

            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if permit_type not in ("RENTAL", "MECHANICAL"):
                raise ValueError("Unexpected permit type: %s in row: %s" % (
                    permit_type, row))

            #type of building (eg: sf attached, duplex, etc)
            sub_type = row[2]

            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]
            parcel_id = row[4]
            address = row[5]

            #should be fixed per source
            #(own name, so the City object in `city` is not overwritten)
            row_city = row[6]
            if not ((row_city.lower() == 'ann arbor') or (row_city == '')):
                raise ValueError("Unexpected city: %s" % (row_city))

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]

            if (status not in ['EXPIRED', 'CLOSED']) and (permit_type == 'RENTAL'):
                #check if we've started processing any results for this row
                address_key = address.upper()
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address_key

                #own name, so the `source` argument is not overwritten
                for geo_source in location.sources:
                    geo.lookup(search, geo_source, location)

                location.address_alt = search

                locations[address_key] = location

                #and check if a previous building object in the db exists
                #CREATE A NEW BUILDING OBJECT HERE
                bldg = Building()
                bldg.type = sub_type

            #back it up for later
            local_cache['buildings'] = {}
            for key, value in locations.items():
                local_cache['buildings'][key] = value.to_dict()

            save_json(cache_destination, local_cache)

            #THE FOLLOWING ARE FOR INFORMATIONAL PURPOSES ONLY
            #(to see what data is available)

            if status not in status_types:
                status_types.append(status)

            if sub_type not in permit_sub_types:
                permit_sub_types.append(sub_type)

            building_num = row[8]
            if building_num not in building_nums:
                building_nums.append(building_num)

            applicant = row[9]
            if any(marker in applicant for marker in manager_markers):
                if applicant not in managers:
                    managers.append(applicant)
            else:
                if applicant not in applicants:
                    applicants.append(applicant)

    ## print permit_sub_types
    print(status_types)
    print(building_nums)

    save_results(locations)
def _rental_address_parts(row):
    """Return ``(address_main, apt_main)`` for one row of the rental CSV.

    ``address_main`` is the street address (possibly with a BLDG/LOT
    qualifier appended) after the hard-coded street fixes and the
    module-level ``conversions`` table have been applied.  It is an empty
    value when the row should be skipped.  ``apt_main`` is the unit
    qualifier plus unit number, or '' when that information was folded into
    ``address_main``.

    Column layout, as used/annotated by the original import script:
      0 objectid, 1 parcel id, 2 street number, 3 street direction,
      4 street name, 5 street suffix, 6 qualifier before the unit
      (eg building number), 7 qualifier after (eg "UNIT" or "APT"),
      8 apartment number, 10 zip code, 15 lng (xcoord), 16 lat,
      17 entry floor ('z' in sheet), 22 zone, 23 building type,
      24 number of buildings, 25 number of units,
      34 central heat ('Y' or not), 35 heat mech.
    Only columns 2-8 are read here.
    """
    street_num = row[2]
    street_dir = row[3]
    street_name = row[4]
    street_sfx = row[5]
    qualifier_pre = row[6]
    qualifier_post = row[7]
    apt_num = row[8]

    #these values cause problems with lookups in google
    if qualifier_pre in ("DUP", "DUPE", "2-Jan", "HM", "DWN"):
        qualifier_pre = ''

    address_main = " ".join([street_num, street_dir, street_name,
                             street_sfx, qualifier_pre])
    address_main = address_main.strip()
    #get rid of any double spaces
    address_main = address_main.replace("  ", " ")

    #similar to conversions,
    #but there are too many of these to list there
    street_fixes = (
        ('HOLLY RIDGE LN', 'HOLLYRIDGE LN'),
        ('BERKSHIRE CT', 'BERKSHIRE'),
        ('CAMERON CT', 'CAMERON'),
        ('ATHENS CT', 'ATHENS'),
        ('LAMAR CT', 'LAMAR'),
        ('MONITEAU CT', 'MONITEAU'),
    )
    for wrong, right in street_fixes:
        address_main = address_main.replace(wrong, right)

    #streets that are skipped outright
    for street in ('IMPERIAL CT', 'PERKINS DR', 'GRANITE OAKS CT'):
        if street in address_main:
            address_main = ''

    #sometimes the 'BLDG' data is added in the wrong place
    #then it gets treated as a unit item
    #(but it's not *always* a unit item, so can't generalize it that way)
    #Only fold it in when there is still an address to attach it to,
    #otherwise a street blanked above would be revived.
    if address_main and qualifier_post in ("BLDG", "LOT"):
        #apt_num (not apt_main, which is not assigned yet) belongs here
        address_main = " ".join([address_main, qualifier_post, apt_num])
        address_main = address_main.strip()
        apt_main = ''
    else:
        apt_main = " ".join([qualifier_post, apt_num])
        apt_main = apt_main.strip()

    #check if this is one we want to convert or skip
    lookup_key = address_main.upper()
    if lookup_key in conversions:
        address_main = conversions[lookup_key]

    return address_main, apt_main


def _import_usage_table(driver, table_id, label, utility_type, unit_name,
                        bldg, unit, provider, tab_selector=None):
    """Read one usage table from the loaded page and store each row.

    Clicks ``tab_selector`` first when given, then the third option of the
    table's length drop-down.  If either element is missing the table is
    reported as unavailable and nothing is stored.  Otherwise every table
    row with at least three cells is passed to ``update_summary`` as
    (date, cost, amount), with '-01' appended to the date text, '$' removed
    from the cost and the upper-cased unit suffix removed from the amount.
    """
    try:
        if tab_selector:
            driver.find_element_by_css_selector(tab_selector).click()
        length_option = driver.find_element_by_css_selector(
            '#%s_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)' % table_id)
        length_option.click()
    except Exception:
        #best effort: a missing table is expected for some properties
        print("No %s data available... skipping" % label)
        return

    body = driver.find_element_by_css_selector(
        '#%s > tbody:nth-child(3)' % table_id)
    query = bldg.utilitysummary_set.filter(type=utility_type)
    suffix = ' ' + unit_name.upper()
    for tr in body.find_elements_by_tag_name('tr'):
        cols = tr.find_elements_by_tag_name('td')
        if len(cols) < 3:
            #placeholder / malformed row: nothing to record
            continue
        date = cols[0].text + '-01'
        cost = cols[1].text.replace('$', '').strip()
        amount = cols[2].text.replace(suffix, '')
        update_summary(query, date, cost, amount, bldg, unit, provider,
                       utility_type, unit_name)


def _scrape_unit_utilities(driver, parcel_id, bldg, unit, provider):
    """Load the utility page for ``parcel_id`` and update ``bldg``/``unit``.

    Records a gas heat source on the building when the page says so,
    imports the electricity and water tables, then refreshes the unit's
    averages and energy score and the building's utility/rent details.
    """
    #the form uses the property id, which we already have
    base = "http://www.gocolumbiamo.com/cfforms/ub/ubdata.cfm?LOCID=%s&AppNum=79" % parcel_id
    driver.get(base)

    try:
        heat_source = driver.find_element_by_css_selector('#PrimaryCenterColumn > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(1) > strong:nth-child(1) > font:nth-child(1)')
        if heat_source.text.strip() == "Heating Source: Gas Heat":
            bldg.heat_source_details = 'gas'
            bldg.save()
        else:
            #The original called exit() here, but the surrounding bare
            #except swallowed the SystemExit, so the run always continued.
            #Make that explicit instead of relying on the accident.
            #TODO: decide how to record non-gas sources
            #(eg heat_source_details = 'electric',
            #who_pays_gas = 'not_available')
            print(heat_source.text)
            print("unrecognized heat source... skipping")
    except Exception:
        print("heat source not found... skipping")

    _import_usage_table(driver, 'el_table', 'Electricity', 'electricity',
                        'kwh', bldg, unit, provider)
    _import_usage_table(driver, 'wr_table', 'Water', 'water', 'ccf',
                        bldg, unit, provider, tab_selector='#ext-gen23')

    unit.update_averages()

    #see if we have enough info now to make a score:
    unit.update_energy_score()

    #now that we've saved the unit,
    #update the averages for the whole building:
    unit.building.update_utility_averages()
    unit.building.update_rent_details()


def read_csv(source_csv, city_name, city_tag, driver):
    """Import utility usage for the rental addresses listed in a CSV file.

    The first CSV row is printed as a header.  Every later row whose
    1-based index is at or past the position stored in ``position.json``
    is turned into an address, matched against the search results cached
    in ``<city.tag>-20150525.json.bkup`` next to ``source_csv``, resolved
    with ``lookup_building_with_geo`` and, when a unit is found, scraped
    through ``driver``.  The position file is rewritten after each such row
    so an interrupted run can resume.

    Args:
        source_csv: path of the CSV file to read.
        city_name: not used by this function; kept so callers don't break.
        city_tag: tag of an existing ``City`` record.
        driver: browser driver exposing ``get`` and
            ``find_element_by_css_selector`` (presumably a Selenium
            WebDriver -- confirm against the caller).

    Raises:
        ValueError: if no City matches ``city_tag``, if no ServiceProvider
            named 'City of Columbia' exists, or if a lookup reports errors.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    position_file = "position.json"
    position = load_json(position_file, create=True)
    if not position:
        position = 0

    cache_file = "%s-20150525.json.bkup" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}

    search_results = {}
    for key, value in local_cache['buildings'].items():
        sr = SearchResults()
        sr.from_dict(value)
        search_results[key] = sr

    provider_options = ServiceProvider.objects.filter(name='City of Columbia')
    if not len(provider_options):
        raise ValueError("error finding utility_provider: %s matches" % len(provider_options))
    provider = provider_options[0]

    skips = 0
    with open(source_csv) as csvfile:

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        count = 0

        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print("Looking at row: %s, position: %s" % (count, position))
            start = datetime.now()
            print("Started:  %s" % start)

            #NOTE: nothing in this function sets this to True any more;
            #it only mattered while geocoding was done in this pass
            any_updated = False

            #rows before the saved position were handled by an earlier run
            if count >= position:
                #can pass this in as bldg_id to make_building
                #that gets used for parcel too
                parcel_id = row[1]

                address_main, apt_main = _rental_address_parts(row)

                #always reset, so a skipped row can never reuse the
                #address of the row before it
                address = ''
                if address_main:
                    print("APT_MAIN:  %s" % apt_main)
                    address = ", ".join([address_main, apt_main])

                print("Parcel ID: %s" % parcel_id)
                print(address)

                #make sure it's not one we're skipping:
                if not address:
                    print("SKIPPING ITEM: %s" % row[1])
                    skips += 1

                #check if we've started processing any results for this row
                elif address.upper() not in search_results:
                    print("No saved search results for address: %s" % address)
                    print("Skipping.")
                    print("")

                else:
                    print("Already had building: %s" % address)
                    results = search_results[address.upper()]
                    assert results

                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)

                    if results.errors:
                        print(results)
                        raise ValueError(results.errors)

                    bldg = results.building
                    assert bldg
                    unit = results.unit

                    #at this point there should be at least one unit
                    #and we will want to associate results with that unit
                    if not unit:
                        print("Skipping address... no matching Unit!")
                    else:
                        #now that we have a building
                        #look up energy data on the remote website
                        _scrape_unit_utilities(driver, parcel_id, bldg,
                                               unit, provider)

                position += 1
                save_json(position_file, position)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in search_results.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

                position = count
                save_json(position_file, position)
                exit()

            end = datetime.now()
            print("finished:  %s" % end)
            total_time = end - start
            print(total_time)

            print("")
Пример #16
0
def read_csv(source_csv, city_name, city_tag):
    """Geocode and store the buildings listed in a rental-invoice CSV.

    Makes sure a FeedInfo (dated 2013-07-31), a placeholder Person named
    "Blank" and a Source exist for the city, then walks the CSV.  For each
    data row the address in column 0 is rearranged so the part after the
    last comma comes first, passed through the module-level ``conversions``
    table, geocoded with ``Geo.lookup`` and stored with ``make_building``.
    The text in column 6 (after a leading 'Sent to:') is stored through
    ``make_person`` as the "Permit Applicant".  Geocoding results are
    cached in ``<city.tag>.json`` next to ``source_csv`` and all locations
    are finally written to ``<city_tag>.tsv`` via ``save_results``.

    Args:
        source_csv: path of the CSV file to read.
        city_name: city name used in the geocoder search string.
        city_tag: tag of an existing ``City`` record.

    Raises:
        ValueError: if no City matches ``city_tag``.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)

            address = row[0]

            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            street = ",".join(parts[:-1])
            address = "%s %s" % (anumber, street)

            bldg_id = row[1]
            print(bldg_id)

            #this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            if re.match('Sent to:', invoice_note):
                print("changing invoice note from: %s" % invoice_note)
                invoice_note = invoice_note[8:]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            #check if this is one we want to skip
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                print("")
                #stop here: falling through would geocode an empty search
                #and overwrite the Location left over from the previous row
                continue

            #check if we've started processing any results for this row
            location_key = address.upper()
            if location_key in locations:
                location = locations[location_key]
            else:
                location = Location()

            #temporarily just want to look at google and bing again
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (location_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            #restore the full source list before the location is stored
            location.sources = [
                "google", "bing", "usgeo", "geonames", "openmq", "mq"
            ]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[location_key] = location

            #handle the database storage
            bldg = make_building(location,
                                 bldg_id,
                                 city,
                                 feed_source,
                                 no_units=no_units)

            if invoice_note:
                #separate names so the "Blank" person above is not clobbered
                (applicant, bldg_person) = make_person(invoice_note, bldg,
                                                       "Permit Applicant")

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Пример #17
0
        #do some geocoding, as needed:
        search = "%s %s" % (city_name, city_state)

        any_updated = False
        for geo_source in location.sources:
            update = geo.lookup(search, geo_source, location, force=True)
            if update:
                any_updated = True

            result = location.get_source(geo_source)
            print len(result)
            print result
            city.latitude = result[0]['lat']
            city.longitude = result[0]['lng']

        location.sources = [
            "google", "bing", "usgeo", "geonames", "openmq", "mq"
        ]

        saved_cities[city_tag] = {
            "name": city.name,
            "state": city.state,
            "tag": city.tag,
            "lat": city.latitude,
            "lng": city.longitude
        }

        save_json(cache_destination, saved_cities)

    city.save()
Пример #18
0
# One-hot encode the training labels for the softmax output layer
y_train = to_categorical(y_train)

# Reshape data for input to Dense layer: one flat vector of 28 * 28 values
# per sample
n_inputs = 28 * 28
X_train = X_train.reshape(-1, n_inputs)
X_test = X_test.reshape(-1, n_inputs)

# Create sequential 2-layer model
model = Sequential()
model.add(Dense(400, input_dim=n_inputs, activation='sigmoid'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.SGD(),
              metrics=['accuracy'])

# Write the summary to file
save_summary(model, 'simple')

# Train model and evaluate training.
# X_train is already flattened above, so it is passed as-is.  The split is
# written as 1.0 / 12 so it stays a fraction even where `/` on two ints
# floors to 0 (Python 2 without `from __future__ import division`).
results = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=64,
                    validation_split=1.0 / 12)
training_eval(results, 'simple')

# Predict and evaluate performance: compare the predicted class index with
# the index of the one-hot test label
y_fit = model.predict(X_test, batch_size=128)
performance_eval('simple', y_fit.argmax(axis=1), y_test.argmax(axis=1))

# Persist the model description and the trained weights
save_json(model, 'simple')
model.save_weights('models/simple_weights.h5')
Пример #19
0
def read_csv(source_csv):
    """
    Import Bloomington, IN building records from a CSV export.

    Looks up the "bloomington_in" City, finds or creates the FeedInfo,
    placeholder Person and Source rows, then walks the CSV one row at a
    time: each address is geocoded, stored with make_building() and linked
    to its owner / agent with make_person().  Geocoding results are kept in
    a JSON cache next to source_csv and the collected locations are written
    to "bloomington-filtered.tsv" at the end.

    source_csv -- path of the CSV file; its first row is only printed.

    Raises ValueError when the city does not exist yet.
    """
    city_options = City.objects.filter(tag="bloomington_in")
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-08-29"

    # find or create the feed that this import belongs to
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if not feeds.exists():
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city)
    else:
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))

    # placeholder person that the source gets attributed to
    people = Person.objects.filter(name="Blank")
    if not people.exists():
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)
    else:
        person = people[0]
        print("Already had person: %s" % (person.name))

    sources = Source.objects.filter(feed=feed)
    if not sources.exists():
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city)
    else:
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))

    # keep a local copy of data we've processed, so that subsequent runs
    # do not need to repeat calls to the remote geolocation APIs
    cache_destination = os.path.join(os.path.dirname(source_csv), "%s.json" % city.tag)
    local_cache = load_json(cache_destination, create=True)
    for section in ("buildings", "parcels"):
        if section not in local_cache:
            local_cache[section] = {}

    locations = {key: Location(value) for key, value in local_cache["buildings"].items()}

    # geocoder helper
    geo = Geo()

    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)

        # the first row is only printed
        print(">, <".join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            bldg_id = row[0]
            print(bldg_id)

            address = row[1]
            print(address)

            owner = row[2]
            # row[3] is not used
            agent = row[4]

            bldg_units = row[9]
            print(bldg_units)

            units_bdrms = row[10]
            print(units_bdrms)

            # conversions can rewrite an address, or blank it out to skip it
            raw_key = address.upper()
            if raw_key in conversions:
                address = conversions[raw_key]

            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                continue

            address_key = address.upper()
            location = locations[address_key] if address_key in locations else Location()

            # temporarily just want to look at google again
            location.sources = ["google"]

            search = "%s, Bloomington IN" % address_key

            # every source is queried; remember whether any of them changed something
            any_updated = False
            for geo_source in location.sources:
                if geo.lookup(search, geo_source, location, force=True):
                    any_updated = True

            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            # a location without address_alt also needs to be written to the cache
            if not (hasattr(location, "address_alt") and location.address_alt):
                any_updated = True

            location.address_alt = search
            location.bldg_units = bldg_units
            location.units_bdrms = units_bdrms
            locations[address_key] = location

            # handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source)

            if owner:
                special = special_cases(owner)
                if special:
                    # NOTE(review): the special-case owner is unpacked but never
                    # passed to make_person() -- confirm that this is intended
                    (owner_name, owner_address) = special
                else:
                    (owner_name, owner_address, owner_phone, remainder) = parse_person(owner)
                    if owner_name:
                        (_owner, _bldg_owner) = make_person(owner_name, bldg, "Owner", address=owner_address)

            if agent and agent != "No Agent":
                (agent_name, agent_address, agent_phone, remainder) = parse_person(agent)
                if agent_name:
                    (_agent, _bldg_agent) = make_person(agent_name, bldg, "Agent", address=agent_address, city=city)

            if any_updated:
                # back it up for later
                # (useful while downloading GPS coordinates; otherwise it slows things down)
                local_cache["buildings"] = {key: value.to_dict() for key, value in locations.items()}
                save_json(cache_destination, local_cache)

            print("")

    save_results(locations, "bloomington-filtered.tsv")
Пример #20
0
# FC6: fully connected layer, 84 tanh units
model.add(Dense(84, activation='tanh'))

# Output layer: 10-way softmax
model.add(Dense(10, activation='softmax'))

# Compile with SGD, categorical cross-entropy loss and an accuracy metric
model.compile(optimizer=optimizers.SGD(),
              metrics=['accuracy'],
              loss='categorical_crossentropy')

# Write the summary to file
save_summary(model, 'lenet')

# Train the model, then evaluate the training run
results = model.fit(X_train,
                    y_train,
                    batch_size=64,
                    epochs=10,
                    validation_split=1 / 12)
training_eval(results, 'lenet')

# Predict on the test set and compare predicted vs. target class indices
y_fit = model.predict(X_test, batch_size=128)
performance_eval('lenet', y_fit.argmax(axis=1), y_test.argmax(axis=1))

# Persist the model description and its weights
save_json(model, 'lenet')
model.save_weights('models/lenet_weights.h5')
Пример #21
0
def read_csv(source_csv, city_name, city_tag):
    """
    Import building / unit records for one city from a CSV export.

    Looks up the City tagged city_tag, finds or creates the FeedInfo,
    placeholder Person and Source rows, then walks the CSV one row at a
    time: the street address is assembled from its column parts, the
    latitude / longitude given in the CSV are attached to a Location, and
    the building, optional unit and owner/agent are stored with
    make_building(), make_unit() and make_person().  Locations are
    mirrored in a JSON cache next to source_csv and written to
    "<city_tag>.tsv" at the end.

    Rows whose address is blanked out by `conversions` are skipped
    entirely.  (Previously the processing code ran for them as well, using
    an unbound or left-over `location` from the previous row and storing it
    under the empty-string key.)

    source_csv -- path of the CSV file; its first row is only printed.
    city_name  -- city name used when building the geocoder search string.
    city_tag   -- tag of an existing City; also names the output .tsv file.

    Raises ValueError when no City with city_tag exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-10-16"

    #find or create the feed that this import belongs to
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    #placeholder person that the source gets attributed to
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)
            print(row)

            #the parcel id doubles as bldg_id for make_building
            #(that gets used for parcel too)
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]

            #xcoord == lng
            lng = row[15]
            lat = row[16]

            bldg_type = row[23]
            no_units = row[25]

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            address_main = " ".join([
                street_num, street_dir, street_name, street_sfx, qualifier_pre
            ])
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print(address)

            owner_address = ", ".join([
                owner_address1, owner_address2, owner_city, owner_state,
                owner_zip
            ])

            #conversions can rewrite an address, or blank it out to skip it
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                #nothing below applies to a skipped row
                continue

            #check if we've started processing any results for this row
            address_key = address.upper()
            if address_key in locations:
                location = locations[address_key]
            else:
                location = Location()

            #skip geocoding for columbia
            #(list the sources to query here, e.g. ["google", "bing"])
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            location.sources = [
                'csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"
            ]

            #manually add the coordinates supplied by the csv as source 'csv':
            location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[address_key] = location

            #handle the database storage
            bldg = make_building(location,
                                 bldg_id,
                                 city,
                                 feed_source,
                                 no_units=no_units,
                                 bldg_type=bldg_type)

            if apt_main:
                unit = make_unit(apt_main, bldg)

            (person, bldg_person) = make_person(owner_name,
                                                bldg,
                                                "Agent",
                                                address=owner_address)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Пример #22
0
# One-hot encode the class labels of both splits
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

model = create_lenet()

# Train the model (the training evaluation step is disabled for this run)
results = model.fit(X_train, y_train, batch_size=64, epochs=10)

# Predict on the test set and compare predicted vs. target class indices
y_fit = model.predict(X_test, batch_size=128)
performance_eval('final', y_fit.argmax(axis=1), y_test.argmax(axis=1))

# Persist the model description and its weights
save_json(model, 'final')
model.save_weights('models/final_weights.h5')

# Collect the test samples whose predicted class differs from the target
predicted_all = y_fit.argmax(axis=1)
target_all = y_test.argmax(axis=1)
mis_index = np.nonzero(predicted_all != target_all)
misclassifieds = X_test[mis_index]
predicted_labels = predicted_all[mis_index]
target_labels = target_all[mis_index]

# Show the labels as 10 x 10 grids
print('MNIST misclassifieds - predicted labels')
print(np.resize(predicted_labels, (10, 10)))
print('\nMNIST misclassifieds - target labels')
print(np.resize(target_labels, (10, 10)))

plot_samples(misclassifieds.reshape(np.shape(mis_index)[1], 28, 28),
             title='MNIST_misclassifieds',
             width=10,
Пример #23
0
def read_csv(source_csv, city_name, city_tag, start_row=27187):
    """
    Import building records for one city from a CSV export, resolving each
    address through address_search().

    Looks up the City tagged city_tag, finds or creates the FeedInfo,
    placeholder Person and Source rows, then walks the CSV one row at a
    time.  For every row at or after start_row the street address is
    assembled and normalized, resolved with address_search() (unless a
    cached result exists), stored with lookup_building_with_geo() and
    linked to its owner/agent with make_person().  Search results are
    mirrored in a dated JSON cache next to source_csv; addresses that are
    skipped are appended to "skips.txt".

    source_csv -- path of the CSV file; its first row is only printed.
    city_name  -- accepted for signature compatibility; the name stored on
                  the City record is what gets used in searches.
    city_tag   -- tag of an existing City.
    start_row  -- 1-based number of the first data row to process; earlier
                  rows are only counted.  Defaults to the value that used
                  to be hard-coded.

    Raises ValueError when no City with city_tag exists, or when
    lookup_building_with_geo() reports errors for a row.  Calls exit() when
    a search returns several matches and none agrees with the CSV
    coordinates to two decimal places.
    """
    #street names that have to be spelled differently for the lookups
    #(similar to conversions, but there are too many of these to list there)
    street_renames = (
        ("HOLLY RIDGE LN", "HOLLYRIDGE LN"),
        ("BERKSHIRE CT", "BERKSHIRE"),
        ("CAMERON CT", "CAMERON"),
        ("ATHENS CT", "ATHENS"),
        ("LAMAR CT", "LAMAR"),
        ("MONITEAU CT", "MONITEAU"),
    )
    #streets whose rows are skipped altogether
    skipped_streets = ("IMPERIAL CT", "PERKINS DR", "GRANITE OAKS CT")
    #qualifier values that cause problems with lookups in google
    ignored_qualifiers = ("DUP", "DUPE", "2-Jan", "HM", "DWN")

    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-10-16"

    # find or create the feed that this import belongs to
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # placeholder person that the source gets attributed to
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if "buildings" not in local_cache:
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        sr = SearchResults()
        sr.from_dict(value)
        search_results[key] = sr

    skips = 0
    with open(source_csv) as csvfile:

        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print(">, <".join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            any_updated = False

            # skip ahead quickly to where processing should start
            if count < start_row:
                print("")
                continue

            # the parcel id is what identifies the building / parcel
            parcel_id = row[1]

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            # eg building number
            qualifier_pre = row[6]
            # eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            zip_code = row[10]
            # xcoord == lng
            lng = row[15]
            lat = row[16]

            # could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            if qualifier_pre in ignored_qualifiers:
                qualifier_pre = ""

            street_parts = [street_num, street_dir, street_name, street_sfx, qualifier_pre]
            address_main = " ".join(street_parts).strip()
            # get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            for old_name, new_name in street_renames:
                address_main = address_main.replace(old_name, new_name)
            if any(street in address_main for street in skipped_streets):
                address_main = ""

            # sometimes the 'BLDG' data is added in the wrong place
            # then it gets treated as a unit item
            # (but it's not *always* a unit item, so can't generalize it that way)
            if qualifier_post == "BLDG" or qualifier_post == "LOT":
                # use this row's apt_num (apt_main is not assigned yet here),
                # and leave an address that was blanked for skipping empty
                if address_main:
                    address_main = " ".join([address_main, qualifier_post, apt_num])
                    address_main = address_main.strip()
                apt_main = ""
            else:
                apt_main = " ".join([qualifier_post, apt_num])
                apt_main = apt_main.strip()

            # conversions can rewrite an address, or blank it out to skip it
            if address_main.upper() in conversions:
                address_main = conversions[address_main.upper()]

            if address_main:
                print("APT_MAIN:  %s" % apt_main)
                address = ", ".join([address_main, apt_main])
            else:
                address = ""

            owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])

            print("Parcel ID: %s" % parcel_id)
            print(address)

            # make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1

                # keep a record of what was skipped
                with codecs.open("skips.txt", "a", encoding="utf-8") as skipf:
                    skipf.write(" ".join(street_parts))
                    skipf.write("\n")

                print("")
                continue

            # check if we've started processing any results for this row
            address_key = address.upper()
            if address_key in search_results:
                print("Already had building: %s" % address)
                results = search_results[address_key]
            else:
                addy = ", ".join([address_main, city.name, city.state])
                addy += " " + zip_code
                print(addy)

                results = address_search(addy, apt_main)

                if len(results.matches) > 1:
                    print(results)
                    for option in results.matches:
                        print("%s: %s, %s" % (option["place"], option["lat"], option["lng"]))
                    print("")
                    print("Source Lat: %s, Lng: %s" % (lat, lng))
                    # only want to look at the first 2 decimal places:
                    src_lat = int(float(lat) * 100)
                    src_lng = int(float(lng) * 100)

                    # keep the match that agrees with the csv coordinates
                    # (when several agree, the last one is kept)
                    matched = False
                    for current in results.matches:
                        print(current["lat"])
                        print(current["lng"])
                        comp_lat = int(float(current["lat"]) * 100)
                        comp_lng = int(float(current["lng"]) * 100)
                        print(comp_lat)
                        print(comp_lng)

                        if (src_lat == comp_lat) and (src_lng == comp_lng):
                            results.matches = [current]
                            matched = True

                    if not matched:
                        print("DIDN'T MATCH!")
                        exit()

                any_updated = True

            assert results

            lookup_building_with_geo(results, make=True, parcel_id=parcel_id)

            if results.errors:
                print(results)
                raise ValueError(results.errors)

            search_results[address_key] = results

            # results.unit may be blank when another unit with a
            # number/letter was created earlier for the same building;
            # that is redundant data, not necessarily an error
            bldg = results.building
            assert bldg

            (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = {}
                for key, value in search_results.items():
                    local_cache["buildings"][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")
Пример #24
0
    else:
        
        location = Location()

        #temporarily just want to look at google again
        location.sources = ["google"]

        #do some geocoding, as needed:
        search = "%s %s" % (city_name, city_state)

        any_updated = False
        for geo_source in location.sources:
            update = geo.lookup(search, geo_source, location, force=True)
            if update:
                any_updated = True

            result = location.get_source(geo_source)
            print len(result)
            print result
            city.latitude = result[0]['lat']
            city.longitude = result[0]['lng']

        location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]
        
        saved_cities[city_tag] = {"name":city.name, "state":city.state, "tag":city.tag, "lat":city.latitude, "lng":city.longitude}

        save_json(cache_destination, saved_cities)
    
    city.save()

# Merge buildings that share a tag: the first building seen for a tag is
# kept; every later one has its units re-pointed at that first building and
# is then deleted.  `previous` maps each tag to the ids seen for it.
for building in buildings:
    if building.tag in previous:
        print("%s previous buildings found with tag: %s" % (len(previous[building.tag]), building.tag))
        first_building = Building.objects.get(id=previous[building.tag][0])
        print("%s units associated" % (len(building.units.all())))

        #move every unit over to the first building with this tag
        for unit in building.units.all():
            print(unit.tag)
            unit.building = first_building
            unit.save()

        previous[building.tag].append(building.id)
        #the duplicate is no longer needed
        building.delete()
    else:
        #first time this tag is seen
        previous[building.tag] = [building.id]

    count += 1

#keep a record of which building ids shared a tag
save_json("building_dupes.json", previous)
print("saving to: building_dupes.json")
Пример #26
0
def read_csv(source_csv, city_name, city_tag):
    """Import building rows from ``source_csv`` into the database.

    Looks up the City tagged ``city_tag``, finds or creates the FeedInfo,
    placeholder Person ("Blank") and Source records for it, then walks the
    CSV.  For each row a Location is built from the row's own coordinates and
    passed to make_building; make_unit is called when the row has an
    apartment qualifier, and make_person is called with the owner/agent
    details.  Locations are cached in ``<city.tag>.json`` next to the CSV and
    finally handed to save_results with the destination ``<city_tag>.tsv``.

    Rows whose address maps to an empty value in the module-level
    ``conversions`` table are skipped entirely.

    Args:
        source_csv: path of the CSV file to read; its first row is only
            printed, not processed.
        city_name: city name used when composing the geocoder search string.
        city_tag: tag used to look up the City and to name the .tsv output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not city_options:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    local_cache.setdefault('buildings', {})
    local_cache.setdefault('parcels', {})

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        #in order to randomize the processing order,
        #randomize the order of the rows in the csv itself
        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)
            print(row)

            #column layout of the sheet (only some columns are used below):
            #  0 address               1 parcel id
            #  2 street number         3 street direction
            #  4 street name           5 street suffix
            #  6 qualifier pre (eg building number)
            #  7 qualifier post (eg "UNIT" or "APT")
            #  8 apartment number      9 in/out
            # 10 zip code             11 assessor id
            # 12 address num          13 x, 14 y
            # 15 xcoord (lng)         16 lat
            # 17 z (entry floor number)
            # 18 strcid, 19 parent, 20 app_, 21 hteloc
            # 22 zone                 23 building type
            # 24 number of buildings  25 number of units
            # 26 inspection type, 27 app number, 28 date received,
            # 29 application type, 30 ownerid, 31 operator id,
            # 32 agent_id, 33 mail to
            # 34 central heat ('Y' or not), 35 heat mech
            # 36-41 agent id, last/first/middle name, title, business name
            # 42 owner (or agent) name
            # 43-47 owner address1, address2, city, state, zip

            #parcel id doubles as the bldg_id passed to make_building
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            qualifier_pre = row[6]
            qualifier_post = row[7]
            apt_num = row[8]
            lng = row[15]
            lat = row[16]
            bldg_type = row[23]
            no_units = row[25]

            #could be owner, could be agent
            owner_name = row[42]
            owner_address = ", ".join(row[43:48])

            address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
            address_main = address_main.strip()
            #get rid of any double spaces
            #NOTE(review): a single pass leaves a double space behind when two
            #adjacent parts are empty; left as-is because existing cache and
            #conversions keys may rely on this exact form -- confirm.
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print(address)

            #check if this is one we want to convert or skip
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                #really skip it: without this the row fell through to the
                #geocoding and database code below with a stale (or, on the
                #first row, undefined) location
                continue

            address_key = address.upper()

            #check if we've started processing any results for this row
            if address_key in locations:
                location = locations[address_key]
            else:
                location = Location()

            #skip geocoding for columbia; to geocode, list sources here, eg:
            #["google", "bing", "usgeo", "geonames", "openmq"]
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            location.sources = ['csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #manually add data from csv here:
            location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not getattr(location, "address_alt", None):
                any_updated = True

            location.address_alt = search
            locations[address_key] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units, bldg_type=bldg_type)

            if apt_main:
                make_unit(apt_main, bldg)

            make_person(owner_name, bldg, "Agent", address=owner_address)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            #blank line between rows in the console output
            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)