def method_new(name="Untitled Q-Method", owner="Your Name", email="email", phone='phone', notes=''):
    #look for existing methods:
    options = os.listdir(data_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make a new directory in method_path
    method_path = os.path.join(data_path, new_option)
    if not os.path.exists(method_path):
        os.makedirs(method_path)
    else:
        #This should never happen with the above while loop, but just in case...
        raise ValueError, "Path exists, but it shouldn't: %s" % method_path

    #make an empty configuration file
    config = os.path.join(method_path, "config.json")
    result = load_json(config, create=True)
    result['name'] = name
    result['owner'] = owner
    result['email'] = email
    result['phone'] = phone
    result['notes'] = notes
    result['statements'] = """1. First sample statement
2. Second sample statement"""
    result['columns'] = '2 3 5 6 8 6 5 3 2'
    save_json(config, result)

    #redirect to the new method's page:
    redirect("/method/" + new_option + "/bookmark/")
def skf_cross_validate(model, X, y):
    """
    Wrapper function to do sklearn-style stratified k-fold cross validation
    on a keras model.
    Some code borrowed from:
    https://medium.com/@literallywords/stratified-k-fold-with-keras-e57c487b1416

    input -
        X, training data
        y, training data labels
    """
    print("Stratified K Fold cross validating")
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X, y)):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Convert class labels to categorical data/one-hot encoding
        y_test = to_categorical(y_test)
        y_train = to_categorical(y_train)

        print('Training ' + 'lenet-cv-' + str(index))
        # Train model and validate
        results = model.fit(X_train, y_train,
                            validation_data=(X_test, y_test),
                            epochs=10, batch_size=64)
        training_eval(results, 'lenet-cv-' + str(index))
        save_json(model, 'lenet-cv-' + str(index))
        model.save_weights('models/lenet-cv-' + str(index) + '_weights.h5')
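# A minimal usage sketch for skf_cross_validate() above (not part of the original
# script). It assumes keras is installed, that skf_cross_validate() and a
# create_lenet() model factory (used in the later snippets) are already in scope,
# and that MNIST-style image data is the input; the preprocessing is illustrative.
# Note that the wrapper reuses one compiled model across folds, so weights carry
# over from fold to fold; rebuilding the model inside the loop would give
# independent folds.
from keras.datasets import mnist

(X, y), _ = mnist.load_data()
X = X.reshape(-1, 28, 28, 1).astype('float32') / 255.0  # scale pixels to [0, 1]

model = create_lenet()           # assumed model factory from the later snippets
skf_cross_validate(model, X, y)  # runs 5 stratified folds, saving each fold's model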
def update_json(source, city_tag):
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    local_cache = load_json(cache_destination, create=True)
    assert local_cache.has_key('buildings')
    assert local_cache.has_key('parcels')

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        for source in location.sources:
            if hasattr(location, source):
                result = getattr(location, source)
                #convert from old dict format here
                if isinstance(result, dict):
                    print "Found dictionary in: %s for: %s" % (source, location.address)
                    result = [result]
                    setattr(location, source, result)

        locations[key] = location

    #back it up for later
    #enable this when downloading GPS coordinates...
    #the rest of the time it slows things down
    local_cache['buildings'] = {}
    for key, value in locations.items():
        local_cache['buildings'][key] = value.to_dict()
    save_json(cache_destination, local_cache)
def subject_new(key):
    """
    create a new subject for the Q-Method specified by key
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        #look for existing subjects:
        options = os.listdir(method_path)
        new_option = ""
        #make sure that:
        #a) we have a new id and
        #b) the new id has not already been used
        while (not new_option) or (new_option in options):
            new_option = generate_id()

        #make a new directory in method_path
        subject_path = os.path.join(method_path, new_option)
        if not os.path.exists(subject_path):
            os.makedirs(subject_path)
        else:
            #This should never happen with the above check, but just in case...
            raise ValueError, "Subject path exists, but it shouldn't: %s" % subject_path

        #make an empty configuration file
        config = os.path.join(subject_path, "subject_config.json")
        result = load_json(config, create=True)

        #once the subject starts sorting, we will cache this locally
        #based on the current state of the method configuration
        #result['statements'] = ""
        result['columns'] = u""
        result['json'] = u""
        result['started'] = u""
        #a textual representation of where each statement is
        result['state'] = u""
        result['history'] = u""
        #is it finished? complete? this will prevent further changes:
        result['locked'] = False

        #now:
        now = datetime.now()
        result['created'] = now.strftime("%Y.%m.%d %H:%M:%S")
        # after first movement
        result['started'] = u""
        result['last_update'] = u""
        save_json(config, result)

        #redirect to the new method's page:
        redirect("/method/" + key + "/")
def post_subject_json(mkey=None, skey=None):
    method_path = os.path.join(data_path, mkey)
    if not os.path.exists(method_path):
        return template('404', key=mkey, item="method")
    else:
        subject_path = os.path.join(method_path, skey)
        if not os.path.exists(subject_path):
            return template('404', key=skey, item="subject")
        else:
            method_config = os.path.join(method_path, "config.json")
            #result = load_json(config)
            method_json_file = codecs.open(method_config, 'r', encoding='utf-8', errors='ignore')
            method_json = method_json_file.read()
            method_details = json.loads(method_json)

            subject_config = os.path.join(subject_path, "subject_config.json")
            subject_json_file = codecs.open(subject_config, 'r', encoding='utf-8', errors='ignore')
            subject_json = subject_json_file.read()
            subject_details = json.loads(subject_json)
            #subject_data = load_json(subject_config)

            #changed = False
            now = datetime.now()

            if not subject_details['columns']:
                subject_details['columns'] = method_details['columns']
                #changed = True

            #if request.forms.get('json') != subject_details['json']:
            #this includes the whole tree from javascript:
            # available, placed, and soon responses
            subject_details['json'] = request.forms.get('json')
            #changed = True

            if not subject_details['started']:
                subject_details['started'] = now.strftime("%Y.%m.%d %H:%M:%S")

            subject_details['last_update'] = now.strftime("%Y.%m.%d %H:%M:%S")

            #subject_details['history'] += u"%s,%s\n" % (now.strftime("%Y.%m.%d %H:%M:%S"), request.forms.get('action'))
            #subject_details['history'] += unicode(now.strftime("%Y.%m.%d %H:%M:%S")) + u"," + unicode(request.forms.get('action'))
            #this avoids:
            # UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 99: ordinal not in range(128)
            subject_details['history'] += unicode(now.strftime("%Y.%m.%d %H:%M:%S")) + u"," + request.forms.get('action').decode("utf-8") + u"\n"

            save_json(subject_config, subject_details)

            return template('success')
def save_results(cache_destination, local_cache):
    #destination = '%s.tsv' % city_tag
    #save_results(locations, destination)

    #convert all results to something json serializable
    for_saving = {}
    for key in local_cache.keys():
        current = local_cache[key]
        results = current['results']
        dupe = copy.copy(current)
        dupe['results'] = results.to_dict()
        for_saving[key] = dupe
    save_json(cache_destination, for_saving)
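# The scripts in this section all lean on load_json()/save_json() helpers that are
# not shown here. Below is a minimal sketch of what such helpers might look like,
# assuming plain UTF-8 JSON files on disk; it is an illustration, not the original
# implementation. (The Keras snippets use a save_json(model, name) with a different
# signature, presumably writing model.to_json() to disk instead.)
import os
import json
import codecs

def load_json(path, create=False):
    """Load a JSON file into a Python object; optionally start fresh if missing."""
    if not os.path.exists(path):
        if create:
            return {}
        raise IOError("Missing JSON file: %s" % path)
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def save_json(path, data):
    """Serialize data to pretty-printed JSON and write it back out."""
    with codecs.open(path, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, indent=2, ensure_ascii=False))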
def extract(self):
    """Actually run the extract process."""
    data = OrderedDict()

    for game_type in VALID_GAME_TYPES:
        click.echo(game_type)

        data[game_type] = OrderedDict()

        self._extract_radio_calls(data[game_type], game_type)
        self._extract_throwables(data[game_type], game_type)
        self._extract_weapons(data[game_type], game_type)
        self._extract_equipment(data[game_type], game_type)

        data[game_type] = OrderedDict(sorted(data[game_type].items(), key=lambda k: k[0]))

    helpers.save_json(app.config['UNLOCKABLES_DATA_FILE'], data)
def extract(self):
    """Actually run the extract process."""
    maps_paths = []
    maps_paths.extend(glob(os.path.join(self.packages_dir, '*', 'maps', '*', 'objects.svg')))  # Maps in RWR game directory
    maps_paths.extend(glob(os.path.join(self.workshop_dir, '*', 'media', 'packages', '*', 'maps', '*', 'objects.svg')))  # Maps in RWR workshop directory

    data = OrderedDict()

    for map_path in maps_paths:
        server_type, map_id = utils.parse_map_path(map_path.replace('\\', '/').replace('/objects.svg', ''))

        if not map_id or map_id in INVALID_MAPS or server_type in INVALID_GAME_TYPES:
            click.secho('Invalid map ID ({}) or server type ({})'.format(map_id, server_type), fg='yellow')
            continue

        map_xml = etree.parse(map_path)

        map_infos = map_xml.findtext(
            '//svg:rect[@inkscape:label=\'#general\']/svg:desc',
            namespaces={'svg': 'http://www.w3.org/2000/svg', 'inkscape': 'http://www.inkscape.org/namespaces/inkscape'}
        )

        if not map_infos:
            click.secho('No general map info found', fg='yellow')
            continue

        map_infos = self._parse_map_data(map_infos)

        if 'name' not in map_infos:
            click.secho('Map name not found', fg='yellow')
            continue

        click.echo(server_type + ':' + map_id)

        if server_type not in data:
            data[server_type] = OrderedDict()

        data[server_type][map_id] = OrderedDict([
            ('name', map_infos['name'].replace('Pacific: ', '').title()),
            ('has_minimap', os.path.isfile(os.path.join(app.config['MINIMAPS_IMAGES_DIR'], server_type, map_id + '.png'))),
            ('has_preview', os.path.isfile(os.path.join(app.config['MAPS_PREVIEW_IMAGES_DIR'], server_type, map_id + '.png')))
        ])

    helpers.save_json(app.config['MAPS_DATA_FILE'], data)
def post_method_json(key=None):
    #print dir(request.forms)
    #print request.forms.keys()
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        config = os.path.join(method_path, "config.json")
        result = load_json(config)

        changed = False
        for key in request.forms.keys():
            #special case for the 'statements' key...
            #want to get rid of any extra newline characters
            #this will help calculate the number of statements more accurately
            #(rather than stripping newlines everywhere we look at statements)
            #
            #this works here, but it will make it difficult to provide
            #feedback to the user about how many statements there are
            #compared to how many spaces there are available in columns
            #adding a similar check in method.js
            if key == "statements":
                text = request.forms.get(key)
                lines = text.splitlines()
                new_lines = []
                for line in lines:
                    if line:
                        new_lines.append(line)
                value = '\n'.join(new_lines)
            else:
                value = request.forms.get(key)

            if value != result[key]:
                #print "%s (original) != %s (new)" % (result[key], request.forms.get(key))
                result[key] = value
                changed = True

        if changed:
            #print "METHOD CONFIG CHANGED!!!! (saving)"
            save_json(config, result)

        return template('success')
def extract(self):
    """Actually run the extract process."""
    # Only handle official ranks
    ranks_files_paths = [
        {  # In Vanilla, ranks from all factions are the same, inspired by the US Army
            'country': 'us',
            'path': os.path.join(self.packages_dir, 'vanilla', 'factions', 'brown.xml'),
            'game_type': 'vanilla'
        },
        {  # In Pacific, US factions are the same as the Vanilla ones, so only parse IJA ranks
            'country': 'jp',
            'path': os.path.join(self.packages_dir, 'pacific', 'factions', 'ija.xml'),
            'game_type': 'pacific'
        }
    ]

    data = OrderedDict()

    for ranks_file_path in ranks_files_paths:
        click.echo(ranks_file_path['country'])

        data[ranks_file_path['country']] = OrderedDict()

        faction_xml = etree.parse(ranks_file_path['path'])
        faction_xml_root = faction_xml.getroot()

        i = 0

        for rank_node in faction_xml_root.iterchildren('rank'):
            rank_name = rank_node.get('name')

            click.echo(rank_name)

            data[ranks_file_path['country']][i] = OrderedDict([
                ('name', rank_name),
                ('xp', int(float(rank_node.get('xp')) * 10000))
            ])

            self._extract_images(i, ranks_file_path['game_type'], ranks_file_path['country'], rank_node.find('hud_icon').get('filename'))

            i += 1

    helpers.save_json(app.config['RANKS_DATA_FILE'], data)
def read_csv(source):
    #for reading unicode
    #f = codecs.open(source, 'r', encoding='utf-8')

    city_options = City.objects.filter(tag="ann_arbor")
    print len(city_options)
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        city.save()
    else:
        city = city_options[0]

    print city

    #TODO:
    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    #with open('eggs.csv', 'rb') as csvfile:
    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1

            #could exit out early here, if needed
            if count > 10:
                pass

            print row

            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
                raise ValueError, "Unexpected permit type: %s in row: %s" % (
                    permit_type, row)

            #type of building (eg: sf attached, duplex, etc)
            sub_type = row[2]

            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]

            parcel_id = row[4]
            address = row[5]

            #should be fixed per source:
            city = row[6]
            if not ((city.lower() == 'ann arbor') or (city == '')):
                raise ValueError, "Unexpected city: %s" % (city)

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]

            if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):
                #check if we've started processing any results for this row
                #if local_cache['buildings'].has_key(address.upper()):
                #    local_cache_cur = local_cache['buildings'][address.upper()]
                #else:
                #    local_cache_cur = {}

                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address.upper()

                for source in location.sources:
                    geo.lookup(search, source, location)

                location.address_alt = search

                locations[address.upper()] = location
                #local_cache['buildings'][address.upper()] = local_cache_cur

                #and check if a previous building object exists in the db

                #CREATE A NEW BUILDING OBJECT HERE
                #cur_building = Building()
                bldg = Building()
                bldg.type = sub_type

            #back it up for later
            local_cache['buildings'] = {}
            for key, value in locations.items():
                local_cache['buildings'][key] = value.to_dict()

            save_json(cache_destination, local_cache)

            #exit()

            #THE FOLLOWING ARE FOR INFORMATIONAL PURPOSES ONLY
            #(to see what data is available)
            if not status in status_types:
                #print "adding: %s" % sub_type
                status_types.append(status)

            if not sub_type in permit_sub_types:
                #print "adding: %s" % sub_type
                permit_sub_types.append(sub_type)

            building_num = row[8]
            if not building_num in building_nums:
                #print "adding: %s" % sub_type
                building_nums.append(building_num)

            applicant = row[9]
            if (re.search('MGMT', applicant) or
                re.search('REALTY', applicant) or
                re.search('PROPERTIES', applicant) or
                re.search('MANAGEMENT', applicant) or
                re.search('GROUP', applicant) or
                re.search('LLC', applicant) or
                re.search('L.L.C.', applicant) or
                re.search('INC', applicant)):
                if not applicant in managers:
                    managers.append(applicant)
            else:
                if not applicant in applicants:
                    applicants.append(applicant)

            #print ', '.join(row)
            #print

    ## print permit_sub_types
    print status_types
    print building_nums

    save_results(locations)
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0

    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            #could exit out early here, if needed
            if count > 1000:
                #exit()
                pass

            address = row[0]
            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            parts = parts[:-1]
            street = ",".join(parts)
            address = "%s %s" % (anumber, street)

            invoice_number = row[1]
            bldg_id = row[1]
            print bldg_id

            #this is where owner is stored
            invoice_note = row[6]
            print invoice_note

            if re.match('Sent to:', invoice_note):
                print "changing invoice note from: %s" % invoice_note
                invoice_note = invoice_note[8:]
                print "to: %s" % invoice_note
            else:
                #raise ValueError, "invoice note does not start with Sent to"
                print "!!!!!invoice note does not start with Sent to!!!!!"
                print ""
                print ""

            no_units = row[12]

            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)
            ## bldg_type = row[2]
            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]
            ## #should be fixed per source:
            ## ss_city = row[6]
            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]
            ## if not ( (ss_city.lower() == city_name.lower()) or (ss_city == '') ):
            ##     raise ValueError, "Unexpected city: %s" % (ss_city)
            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #temporarily just want to look at google again
                #location.sources = ["google"]
                #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
                location.sources = ["google", "bing"]

                #do some geocoding, as needed:
                search = "%s, %s, %s" % (address.upper(), city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    #update = geo.lookup(search, geo_source, location, force=False)
                    if update:
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                #this is the case for brand new searches
                #(which are updated in a different sense)
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                #location.bldg_units = bldg_units
                #location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                #handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

                if invoice_note:
                    (person, bldg_person) = make_person(invoice_note, bldg, "Permit Applicant")

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in locations.items():
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                print

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source_csv, city_name, city_tag, driver):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    position_file = "position.json"
    position = load_json(position_file, create=True)
    if not position:
        position = 0

    cache_file = "%s-20150525.json.bkup" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}

    search_results = {}
    for key, value in local_cache['buildings'].items():
        #search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        #print
        #print sr
        #print
        search_results[key] = sr

    #geocoder helper:
    #geo = Geo()

    provider = ''
    provider_options = ServiceProvider.objects.filter(name='City of Columbia')
    if len(provider_options):
        provider = provider_options[0]
    else:
        raise ValueError, "error finding utility_provider: %s matches" % len(provider_options)

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv

        for row in reader:
            count += 1
            print "Looking at row: %s, position: %s" % (count, position)

            start = datetime.now()
            print "Started: ", start
            any_updated = False

            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            #if you want to skip ahead more quickly:
            #if count < 0:
            if count < position:
                pass
            else:
                #print row

                objectid = row[0]

                ## no_units = row[12]

                #can pass this in as bldg_id to make_building
                #that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                #eg building number
                qualifier_pre = row[6]
                #eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                #skip row9 (in/out... whatever that means)
                zip_code = row[10]
                #skip row11, assessor id
                #skip row12, address num
                #skip row13, x
                #skip row14, y
                #xcoord == lng
                lng = row[15]
                lat = row[16]
                #entry floor number: (named 'z' in sheet)
                floor = row[17]
                #skip row18, strcid... not sure
                #skip row19, parent
                #skip row20, app_
                #skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                #number of buildings
                bldg_num = row[24]
                no_units = row[25]
                #skip row[26], inspection type
                #skip row27, app number
                #skip row28, date received
                #skip row29, application type
                #skip row30, ownerid
                #skip row31, operator id
                #skip row32, agent_id
                #skip row33, mail to
                central_heat = row[34]
                if central_heat == 'Y':
                    central_heat = True
                else:
                    central_heat = False

                #heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                #skip row36, agent id (2)
                #skip row37, agent last name
                #skip row38 agent first name
                #skip row39 agent middle initial
                #skip row40, agent title
                #skip row41, business name

                #could be owner, could be agent
                ## owner_name = row[42]
                ## owner_address1 = row[43]
                ## owner_address2 = row[44]
                ## owner_city = row[45]
                ## owner_state = row[46]
                ## owner_zip = row[47]

                #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                #this is causing problems with lookups in google
                if qualifier_pre == "DUP" or qualifier_pre == "DUPE" or qualifier_pre == "2-Jan" or qualifier_pre == "HM" or qualifier_pre == "DWN":
                    qualifier_pre = ''

                address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                address_main = address_main.strip()
                #get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                #similar to conversions,
                #but there are too many of these to list there
                if re.search('HOLLY RIDGE LN', address_main):
                    address_main = address_main.replace('HOLLY RIDGE LN', 'HOLLYRIDGE LN')
                if re.search('BERKSHIRE CT', address_main):
                    address_main = address_main.replace('BERKSHIRE CT', 'BERKSHIRE')
                    #address_main = ''
                if re.search('CAMERON CT', address_main):
                    address_main = address_main.replace('CAMERON CT', 'CAMERON')
                    #address_main = ''
                if re.search('ATHENS CT', address_main):
                    address_main = address_main.replace('ATHENS CT', 'ATHENS')
                    #address_main = ''
                if re.search('LAMAR CT', address_main):
                    address_main = address_main.replace('LAMAR CT', 'LAMAR')
                    #address_main = ''
                if re.search('MONITEAU CT', address_main):
                    address_main = address_main.replace('MONITEAU CT', 'MONITEAU')
                    #address_main = ''
                if re.search('IMPERIAL CT', address_main):
                    address_main = ''
                if re.search('PERKINS DR', address_main):
                    address_main = ''
                if re.search('GRANITE OAKS CT', address_main):
                    address_main = ''

                #sometimes the 'BLDG' data is added in the wrong place
                #then it gets treated as a unit item
                #(but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    address_main = " ".join([address_main, qualifier_post, apt_main])
                    address_main = address_main.strip()
                    apt_main = ''
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                #check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                #make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1
                    ## skips = codecs.open("skips.txt", 'a', encoding='utf-8')
                    ## original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    ## skips.write(original)
                    ## skips.write('\n')
                    ## skips.close()

                #check if we've started processing any results for this row
                elif not search_results.has_key(address.upper()):
                    print "No saved search results for address: %s" % address
                    print "Skipping."
                    print
                    #raise ValueError, "No results found for %s" % address

                else:
                    print "Already had building: %s" % address
                    results = search_results[address.upper()]
                    assert results
                    #print results

                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
                    #print results
                    #current['results'] = results
                    #print results

                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:
                        bldg = results.building
                        assert bldg
                        unit = results.unit

                        #at this point there should be at least one unit
                        #and we will want to associate results with that unit
                        #assert unit
                        # can just pass this up in this case
                        if not unit:
                            print "Skipping address... no matching Unit!"
                        else:
                            #now that we have a building
                            #look up energy data on the remote website

                            #result = urllib2.urlopen("http://example.com/foo/bar")
                            #print result.read()

                            ## base = "http://www.gocolumbiamo.com/cfforms/ub/rental.html"
                            ## driver.get(base)
                            ## search = driver.find_element_by_css_selector('#address')
                            ## search.send_keys(address)
                            ## button = driver.find_element_by_css_selector('.ui-bar > a:nth-child(2)')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b > span > span')
                            ## button.click()
                            ## time.sleep(4)
                            ## #results = driver.find_element_by_css_selector('.dojoxGridMasterView')
                            ## results = driver.find_element_by_css_selector('.dojoxGridContent > div:nth-child(1)')
                            ## print results.get_attribute('innerHTML')
                            ## print parcel_id
                            ## options = results.find_elements_by_tag_name('div')
                            ## #options = results.find_elements_by_link_text(parcel_id)
                            ## print options
                            ## #something didn't work with this:
                            ## #look_for = '<td tabindex="-1" role="gridcell" colspan="1" class="dojoxGridCell" idx="0" style="width:90px;">%s</td>' % parcel_id
                            ## look_for = '>%s<' % parcel_id
                            ## matches = []
                            ## for option in options:
                            ##     markup = option.get_attribute('innerHTML')
                            ##     #print markup
                            ##     if re.search(look_for, markup):
                            ##         matches.append(option)
                            ##         #print "MATCH!"
                            ## if len(matches) > 1:
                            ##     print matches
                            ##     raise ValueError, "Too many matches!"
                            ## else:
                            ##     matches[0].click()

                            #just realized that this form uses the property_id
                            #which we already have...
                            #can skip the steps above that are trying to make this link:
                            base = "http://www.gocolumbiamo.com/cfforms/ub/ubdata.cfm?LOCID=%s&AppNum=79" % parcel_id
                            driver.get(base)

                            try:
                                heat_source = driver.find_element_by_css_selector('#PrimaryCenterColumn > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(1) > strong:nth-child(1) > font:nth-child(1)')
                                if heat_source.text.strip() == "Heating Source: Gas Heat":
                                    bldg.heat_source_details = 'gas'
                                    bldg.save()
                                else:
                                    print heat_source.text
                                    exit()
                                    #TODO:
                                    bldg.heat_source_details = 'electric'
                                    bldg.who_pays_gas = 'not_available'
                            except:
                                print "heat source not found... skipping"

                            try:
                                selector = driver.find_element_by_css_selector('#el_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except:
                                print "No Water data available... skipping"
                            else:
                                body = driver.find_element_by_css_selector('#el_table > tbody:nth-child(3)')
                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='electricity')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' KWH', '')
                                    update_summary(query, date, cost, amount, bldg, unit, provider, 'electricity', 'kwh')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text

                            #print dir(bldg)
                            #print bldg.utilitysummary_set
                            #query = bldg.utilitysummary_set.filter(type=utility_type[0])

                            #could look up type from UTILITY_TYPES...
                            #but in this case we know what they should be
                            #query = bldg.utilitysummary_set.filter(type='water')
                            #if len(query):

                            try:
                                water = driver.find_element_by_css_selector('#ext-gen23')
                                water.click()
                                selector = driver.find_element_by_css_selector('#wr_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except:
                                print "No Water data available... skipping"
                            else:
                                body = driver.find_element_by_css_selector('#wr_table > tbody:nth-child(3)')
                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='water')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' CCF', '')
                                    update_summary(query, date, cost, amount, bldg, unit, provider, 'water', 'ccf')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text

                            unit.update_averages()
                            #see if we have enough info now to make a score:
                            unit.update_energy_score()

                            #now that we've saved the unit,
                            #update the averages for the whole building:
                            unit.building.update_utility_averages()
                            unit.building.update_rent_details()

                position += 1
                save_json(position_file, position)

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in search_results.items():
                        #search_results[key] = SearchResults().from_dict(value)
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                    position = count
                    save_json(position_file, position)
                    exit()

                end = datetime.now()
                print "finished: ", end
                total_time = end - start
                print total_time
                print
#do some geocoding, as needed:
search = "%s %s" % (city_name, city_state)

any_updated = False
for geo_source in location.sources:
    update = geo.lookup(search, geo_source, location, force=True)
    if update:
        any_updated = True

result = location.get_source(geo_source)
print len(result)
print result
city.latitude = result[0]['lat']
city.longitude = result[0]['lng']

location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

saved_cities[city_tag] = {
    "name": city.name,
    "state": city.state,
    "tag": city.tag,
    "lat": city.latitude,
    "lng": city.longitude
}

save_json(cache_destination, saved_cities)

city.save()
y_train = to_categorical(y_train)

# Reshape data for input to Dense layer
X_train = X_train.reshape(-1, 28 * 28)
X_test = X_test.reshape(-1, 28 * 28)

# Create sequential 2-layer model
model = Sequential()
model.add(Dense(400, input_dim=28 * 28, activation='sigmoid'))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.SGD(),
              metrics=['accuracy'])

# Write the summary to file
save_summary(model, 'simple')

# Train model and evaluate training
results = model.fit(X_train.reshape(-1, 28 * 28), y_train,
                    epochs=10, batch_size=64, validation_split=1 / 12)
training_eval(results, 'simple')

# Predict and evaluate performance
y_fit = model.predict(X_test, batch_size=128)
performance_eval('simple', y_fit.argmax(axis=1), y_test.argmax(axis=1))

save_json(model, 'simple')
model.save_weights('models/simple_weights.h5')
def read_csv(source_csv):
    city_options = City.objects.filter(tag="bloomington_in")
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = "Bloomington"
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-08-29"
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}
    if not local_cache.has_key("parcels"):
        local_cache["parcels"] = {}

    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        # reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            # could exit out early here, if needed
            if count > 1000:
                # exit()
                pass

            bldg_id = row[0]
            print bldg_id

            address = row[1]
            print address

            owner = row[2]
            # skip this:
            ownder_contact = row[3]
            agent = row[4]
            bldg_units = row[9]
            print bldg_units
            units_bdrms = row[10]
            print units_bdrms

            # check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            # make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                # temporarily just want to look at google again
                location.sources = ["google"]

                # do some geocoding, as needed:
                search = "%s, Bloomington IN" % address.upper()

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    if update:
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                location.bldg_units = bldg_units
                location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source)

                # owner_details = parse_person(owner)
                if owner:
                    result = special_cases(owner)
                    if result:
                        (owner_name, owner_address) = result
                    else:
                        (owner_name, owner_address, owner_phone, remainder) = parse_person(owner)

                    ## print "owner name: %s" % owner_name
                    ## print "owner address: %s" % owner_address
                    ## print ""

                    if owner_name:
                        (person, bldg_person) = make_person(owner_name, bldg, "Owner", address=owner_address)

                if agent and agent != "No Agent":
                    # agent_details = parse_person(agent)
                    (agent_name, agent_address, agent_phone, remainder) = parse_person(agent)

                    ## print "agent name: %s" % agent_name
                    ## print "agent address: %s" % agent_address
                    ## print ""

                    if agent_name:
                        (person, bldg_person) = make_person(agent_name, bldg, "Agent", address=agent_address, city=city)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = {}
                    for key, value in locations.items():
                        local_cache["buildings"][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                print

    save_results(locations, "bloomington-filtered.tsv")
# FC6 Fully connected layer
model.add(Dense(84, activation='tanh'))

# Output layer
model.add(Dense(10, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.SGD(),
              metrics=['accuracy'])

# Write the summary to file
save_summary(model, 'lenet')

# Train model and evaluate training
results = model.fit(X_train, y_train, epochs=10, batch_size=64,
                    validation_split=1 / 12)
training_eval(results, 'lenet')

# Predict and evaluate performance
y_fit = model.predict(X_test, batch_size=128)
performance_eval('lenet', y_fit.argmax(axis=1), y_test.argmax(axis=1))

save_json(model, 'lenet')
model.save_weights('models/lenet_weights.h5')
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            print row

            address = row[0]

            ## no_units = row[12]

            #can pass this in as bldg_id to make_building
            #that gets used for parcel too
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            #skip row9 (in/out... whatever that means)
            zip_code = row[10]
            #skip row11, assessor id
            #skip row12, address num
            #skip row13, x
            #skip row14, y
            #xcoord == lng
            lng = row[15]
            lat = row[16]
            #entry floor number: (named 'z' in sheet)
            floor = row[17]
            #skip row18, strcid... not sure
            #skip row19, parent
            #skip row20, app_
            #skip row21, hteloc
            zone = row[22]
            bldg_type = row[23]
            #number of buildings
            bldg_num = row[24]
            no_units = row[25]
            #skip row[26], inspection type
            #skip row27, app number
            #skip row28, date received
            #skip row29, application type
            #skip row30, ownerid
            #skip row31, operator id
            #skip row32, agent_id
            #skip row33, mail to
            central_heat = row[34]
            if central_heat == 'Y':
                central_heat = True
            else:
                central_heat = False

            #heat mechanism? heat mechanic??? not sure
            heat_mech = row[35]
            #skip row36, agent id (2)
            #skip row37, agent last name
            #skip row38 agent first name
            #skip row39 agent middle initial
            #skip row40, agent title
            #skip row41, business name

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])
            address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print address

            owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])

            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)

            ## bldg_type = row[2]

            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]

            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #temporarily just want to look at google again
                #location.sources = ["google"]
                #location.sources = ["google", "bing"]
                #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]

                #skip geocoding for columbia
                location.sources = []

                #do some geocoding, as needed:
                search = "%s, %s, %s" % (address.upper(), city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    #update = geo.lookup(search, geo_source, location, force=False)
                    if update:
                        any_updated = True

                location.sources = ['csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"]

                #manually add data from csv here:
                result = []
                result.append({'place': address, 'lat': lat, 'lng': lng})
                setattr(location, 'csv', result)

                #this is the case for brand new searches
                #(which are updated in a different sense)
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True
                    location.address_alt = search

                #location.bldg_units = bldg_units
                #location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                #handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units, bldg_type=bldg_type)

                if apt_main:
                    unit = make_unit(apt_main, bldg)

                (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in locations.items():
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                print

            #exit()

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
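# read_csv() above relies on a unicode_csv_reader() helper that is not defined
# in these snippets. A minimal sketch, assuming it follows the standard
# Python 2 csv-module recipe for decoding UTF-8 rows; the real helper may
# differ.
import csv

def unicode_csv_reader(utf8_file, dialect=csv.excel, **kwargs):
    """Yield each row of a UTF-8 encoded CSV file as a list of unicode strings."""
    reader = csv.reader(utf8_file, dialect=dialect, **kwargs)
    for row in reader:
        yield [cell.decode('utf-8') for cell in row]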
# Convert class labels to categorical data/one-hot encoding
y_test = to_categorical(y_test)
y_train = to_categorical(y_train)

model = create_lenet()

# Train model and evaluate training
results = model.fit(X_train, y_train, epochs=10, batch_size=64)
#training_eval(results, 'final')

# Predict and evaluate performance
y_fit = model.predict(X_test, batch_size=128)
performance_eval('final', y_fit.argmax(axis=1), y_test.argmax(axis=1))
save_json(model, 'final')
model.save_weights('models/final_weights.h5')

# Plot the problems
mis_index = np.where(y_fit.argmax(axis=1) != y_test.argmax(axis=1))
misclassifieds = X_test[mis_index]
predicted_labels = y_fit.argmax(axis=1)[mis_index]
target_labels = y_test.argmax(axis=1)[mis_index]
print('MNIST misclassifieds - predicted labels')
print(np.resize(predicted_labels, 10 * 10).reshape((10, 10)))
print('\nMNIST misclassifieds - target labels')
print(np.resize(target_labels, 10 * 10).reshape((10, 10)))
plot_samples(misclassifieds.reshape(np.shape(mis_index)[1], 28, 28),
             title='MNIST_misclassifieds', width=10)
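# The misclassified-sample extraction above hinges on comparing the argmax of
# the predicted probabilities with the argmax of the one-hot targets. A toy,
# self-contained illustration of that pattern (the array values are made up):
import numpy as np

y_fit = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])  # predicted probabilities
y_true = np.array([[0, 1], [0, 1], [0, 1]])              # one-hot targets

mis_index = np.where(y_fit.argmax(axis=1) != y_true.argmax(axis=1))
print(mis_index)                         # (array([1]),): only the second sample is wrong
print(y_fit.argmax(axis=1)[mis_index])   # its predicted label: [0]
print(y_true.argmax(axis=1)[mis_index])  # its target label: [1]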
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        # search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        # print
        # print sr
        # print
        search_results[key] = sr

    # geocoder helper:
    # geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0

        # want to randomize the order... distribute options more evenly
        # print len(reader)
        # exit()
        # in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            any_updated = False

            # could exit out early here, if needed
            if count > 10:
                # exit()
                pass

            # if you want to skip ahead more quickly:
            if count < 27187:
                pass
            else:
                # print row

                objectid = row[0]

                ## no_units = row[12]

                # can pass this in as bldg_id to make_building
                # that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                # eg building number
                qualifier_pre = row[6]
                # eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                # skip row9 (in/out... whatever that means)
                zip_code = row[10]
                # skip row11, assessor id
                # skip row12, address num
                # skip row13, x
                # skip row14, y
                # xcoord == lng
                lng = row[15]
                lat = row[16]
                # entry floor number: (named 'z' in sheet)
                floor = row[17]
                # skip row18, strcid... not sure
                # skip row19, parent
                # skip row20, app_
                # skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                # number of buildings
                bldg_num = row[24]
                no_units = row[25]
                # skip row[26], inspection type
                # skip row27, app number
                # skip row28, date received
                # skip row29, application type
                # skip row30, ownerid
                # skip row31, operator id
                # skip row32, agent_id
                # skip row33, mail to
                central_heat = row[34]
                if central_heat == "Y":
                    central_heat = True
                else:
                    central_heat = False

                # heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                # skip row36, agent id (2)
                # skip row37, agent last name
                # skip row38 agent first name
                # skip row39 agent middle initial
                # skip row40, agent title
                # skip row41, business name

                # could be owner, could be agent
                owner_name = row[42]
                owner_address1 = row[43]
                owner_address2 = row[44]
                owner_city = row[45]
                owner_state = row[46]
                owner_zip = row[47]

                # address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                # this is causing problems with lookups in google
                if (qualifier_pre == "DUP" or
                        qualifier_pre == "DUPE" or
                        qualifier_pre == "2-Jan" or
                        qualifier_pre == "HM" or
                        qualifier_pre == "DWN"):
                    qualifier_pre = ""

                address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                address_main = address_main.strip()
                # get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                # similar to conversions,
                # but there are too many of these to list there
                if re.search("HOLLY RIDGE LN", address_main):
                    address_main = address_main.replace("HOLLY RIDGE LN", "HOLLYRIDGE LN")
                if re.search("BERKSHIRE CT", address_main):
                    address_main = address_main.replace("BERKSHIRE CT", "BERKSHIRE")
                    # address_main = ''
                if re.search("CAMERON CT", address_main):
                    address_main = address_main.replace("CAMERON CT", "CAMERON")
                    # address_main = ''
                if re.search("ATHENS CT", address_main):
                    address_main = address_main.replace("ATHENS CT", "ATHENS")
                    # address_main = ''
                if re.search("LAMAR CT", address_main):
                    address_main = address_main.replace("LAMAR CT", "LAMAR")
                    # address_main = ''
                if re.search("MONITEAU CT", address_main):
                    address_main = address_main.replace("MONITEAU CT", "MONITEAU")
                    # address_main = ''
                if re.search("IMPERIAL CT", address_main):
                    address_main = ""
                if re.search("PERKINS DR", address_main):
                    address_main = ""
                if re.search("GRANITE OAKS CT", address_main):
                    address_main = ""

                # sometimes the 'BLDG' data is added in the wrong place
                # then it gets treated as a unit item
                # (but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    address_main = " ".join([address_main, qualifier_post, apt_main])
                    address_main = address_main.strip()
                    apt_main = ""
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                # check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])
                else:
                    address = ""

                owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                # make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1
                    skipf = codecs.open("skips.txt", "a", encoding="utf-8")
                    original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    skipf.write(original)
                    skipf.write("\n")
                    skipf.close()
                else:
                    # check if we've started processing any results for this row
                    if search_results.has_key(address.upper()):
                        print "Already had building: %s" % address
                        results = search_results[address.upper()]
                        # print results
                    else:
                        addy = ", ".join([address_main, city.name, city.state])
                        addy += " " + zip_code
                        # addy += ", USA"
                        print addy

                        # toggle between an actual google query
                        results = address_search(addy, apt_main)
                        # print dir(results)

                        if len(results.matches) > 1:
                            print results
                            for option in results.matches:
                                print "%s: %s, %s" % (option["place"], option["lat"], option["lng"])
                            print
                            print "Source Lat: %s, Lng: %s" % (lat, lng)
                            src_lat = int(float(lat) * 100)
                            src_lng = int(float(lng) * 100)

                            matched = False
                            for current in results.matches:
                                # current = results.matches[0]
                                print current["lat"]
                                print current["lng"]
                                # only want to look at the first 2 decimal places:
                                comp_lat = int(float(current["lat"]) * 100)
                                comp_lng = int(float(current["lng"]) * 100)
                                print comp_lat
                                print comp_lng
                                if (src_lat == comp_lat) and (src_lng == comp_lng):
                                    # results.matches = results.matches[:1]
                                    results.matches = [current]
                                    matched = True

                            if not matched:
                                print "DIDN'T MATCH!"
                                exit()

                        any_updated = True

                        # or just using results as specified in csv
                        # (THIS DOES NOT NORMALIZE THE ADDRESS VIA GOOGLE)
                        # results = SearchResults()
                        # results.unit_text = apt_main
                        # handle_place(results, addy, lat, lng, apt_main)

                    assert results
                    # print results

                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
                    # print results
                    # current['results'] = results
                    # print results

                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:
                        search_results[address.upper()] = results

                    bldg = results.building
                    assert bldg
                    unit = results.unit
                    # may be a case where the unit is blank
                    # and another unit with an number/letter was created earlier
                    # in that case, we won't be creating one here
                    # and the building will already exist...
                    # not necessarily an error though
                    # just redundant data
                    # assert unit

                    (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)
                    # time.sleep(1)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = {}
                for key, value in search_results.items():
                    # search_results[key] = SearchResults().from_dict(value)
                    local_cache["buildings"][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print
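# A standalone sketch of the coordinate check used above: a geocoder match is
# only accepted when its lat/lng agree with the source row to two decimal
# places (values are scaled by 100 and truncated to ints before comparing).
# coords_match() is a hypothetical name, not part of the original code.
def coords_match(src_lat, src_lng, match, places=2):
    scale = 10 ** places
    return (int(float(src_lat) * scale) == int(float(match["lat"]) * scale) and
            int(float(src_lng) * scale) == int(float(match["lng"]) * scale))

# e.g. coords_match("38.9517", "-92.3341", {"lat": "38.9512", "lng": "-92.3348"})
# returns True, since both pairs truncate to (3895, -9233)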
else:
    location = Location()

    #temporarily just want to look at google again
    location.sources = ["google"]

    #do some geocoding, as needed:
    search = "%s %s" % (city_name, city_state)

    any_updated = False
    for geo_source in location.sources:
        update = geo.lookup(search, geo_source, location, force=True)
        if update:
            any_updated = True

        result = location.get_source(geo_source)
        print len(result)
        print result
        city.latitude = result[0]['lat']
        city.longitude = result[0]['lng']

    location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

    saved_cities[city_tag] = {"name": city.name,
                              "state": city.state,
                              "tag": city.tag,
                              "lat": city.latitude,
                              "lng": city.longitude}
    save_json(cache_destination, saved_cities)

    city.save()
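# save_json() and load_json() are used throughout these snippets but are not
# defined here. A minimal sketch, assuming the (destination, data) call order
# used by the geodata snippets and thin wrappers around the standard json
# module; the keras snippets appear to call a different save_json(model, name),
# so the real helpers may differ.
import json
import os

def save_json(destination, data):
    with open(destination, 'w') as f:
        json.dump(data, f)

def load_json(source, create=False):
    if not os.path.exists(source):
        if create:
            return {}
        raise ValueError("No such file: %s" % source)
    with open(source) as f:
        return json.load(f)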
for building in buildings:
    if not building.tag in previous:
        previous[building.tag] = [building.id]
    else:
        print "%s previous buildings found with tag: %s" % (len(previous[building.tag]), building.tag)
        first_building = Building.objects.get(id=previous[building.tag][0])
        #print "%s units associated" % (dir(building.units))
        print "%s units associated" % (len(building.units.all()))
        for unit in building.units.all():
            print unit.tag
            unit.building = first_building
            unit.save()
        previous[building.tag].append(building.id)
        #TODO
        #delete this building now
        building.delete()

    count += 1

## keys = range(1496, 1512)
## for key in keys:
##     b = Building.objects.get(id=key)
##     print b

#print json.dumps(previous)
save_json("building_dupes.json", previous)
print "saving to: building_dupes.json"
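# A minimal standalone sketch of the de-duplication pattern above, assuming
# plain (id, tag) pairs instead of Django Building objects: the first id seen
# for a tag is kept and later ids are grouped under it as duplicates.
# group_duplicates() is a hypothetical name, not part of the original code.
def group_duplicates(buildings):
    previous = {}
    for bldg_id, tag in buildings:
        if tag not in previous:
            previous[tag] = [bldg_id]
        else:
            previous[tag].append(bldg_id)
    return previous

# e.g. group_duplicates([(1, 'a'), (2, 'b'), (3, 'a')])
# maps 'a' to [1, 3] and 'b' to [2]; ids after the first in each list are duplicates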