def method_new(name="Untitled Q-Method", owner="Your Name", email="email", phone='phone', notes=''):
    #look for existing methods:
    options = os.listdir(data_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    method_path = os.path.join(data_path, new_option)
    if not os.path.exists(method_path):
        os.makedirs(method_path)
    else:
        #This should never happen with the above while loop, but just in case...
        raise ValueError("Path exists, but it shouldn't: %s" % method_path)

    #make an empty configuration file
    config = os.path.join(method_path, "config.json")
    result = load_json(config, create=True)
    result['name'] = name
    result['owner'] = owner
    result['email'] = email
    result['phone'] = phone
    result['notes'] = notes
    result['statements'] = """1. First sample statement
2. Second sample statement"""
    result['columns'] = '2 3 5 6 8 6 5 3 2'
    save_json(config, result)

    #redirect to the new method's page:
    redirect("/method/" + new_option + "/bookmark/")
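# `method_new` (above) and `subject_new` (below) both depend on a project-specific
# `generate_id` helper that is not shown in this file. A minimal sketch of what such
# a helper could look like; the alphabet and length here are assumptions, not the
# real implementation:
import random
import string

def generate_id(length=8):
    """Return a short random identifier, e.g. for a new method/subject directory."""
    return ''.join(random.choice(string.ascii_lowercase + string.digits)
                   for _ in range(length))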
def get_champion_id_dict(SECRET_API_KEY=None):
    dct = helpers.load_json(".cache/champ_ids.json")
    if not dct or time.time() > dct["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        url = BASE + "/lol/static-data/v3/champions?locale=en_US&dataById=true"
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            #retry this lookup (the original recursed into get_match_from_id
            #with an undefined matchid)
            return get_champion_id_dict(SECRET_API_KEY)
        if r.status_code != 200:
            print("Get champion id dict failed")
            return r
        dct = {}
        data = r.json()["data"]
        for champ_id in data:
            dct[champ_id] = data[champ_id]["name"]
        dct["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
        if not helpers.store_json(dct, ".cache/champ_ids.json", True):
            return None
        return dct
    else:
        return dct
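# A hedged usage sketch for `get_champion_id_dict`: the first call fetches from the
# Riot static-data endpoint and writes `.cache/champ_ids.json`; calls within the next
# month are served from that cache. The API key below is a placeholder.
champs = get_champion_id_dict(SECRET_API_KEY="RGAPI-placeholder")
if isinstance(champs, dict):
    #every key except the bookkeeping "expiry_time" entry maps a champion id to a name
    print(len(champs) - 1, "champion names cached")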
def update_json(source, city_tag):
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    local_cache = load_json(cache_destination, create=True)
    assert local_cache.has_key('buildings')
    assert local_cache.has_key('parcels')

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        #(loop variable renamed from `source` to avoid shadowing the parameter)
        for data_source in location.sources:
            if hasattr(location, data_source):
                result = getattr(location, data_source)
                #convert from old dict format here
                if isinstance(result, dict):
                    print "Found dictionary in: %s for: %s" % (data_source, location.address)
                    result = [result]
                    setattr(location, data_source, result)

        locations[key] = location

    #back it up for later
    #enable this when downloading GPS coordinates...
    #the rest of the time it slows things down
    local_cache['buildings'] = {}
    for key, value in locations.items():
        local_cache['buildings'][key] = value.to_dict()
    save_json(cache_destination, local_cache)
def update_state_json(self, content):
    try:
        cstate = helpers.load_json(content)['cluster']
        self.update_state(cstate)
    except (ValueError, KeyError):
        return False
    return True
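# A small sketch of how `update_state_json` might be called, assuming
# `helpers.load_json` accepts a raw JSON string here (the payload shape with a
# top-level 'cluster' key is taken from the method itself; `manager` is a
# hypothetical owning instance):
payload = '{"cluster": {"nodes": 3, "healthy": true}}'
if not manager.update_state_json(payload):
    print("payload rejected: missing or malformed 'cluster' section")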
def get_account_id(summoner_name, SECRET_API_KEY=None):
    dct = helpers.load_json(".cache/summoners/" + summoner_name + ".json")
    if not dct or time.time() > dct["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not summoner_name:
            raise TypeError("Summoner name cannot be None")
        if not isinstance(summoner_name, str):
            raise TypeError("Summoner name must be a string")
        url = BASE + "/lol/summoner/v3/summoners/by-name/" + summoner_name
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            #retry this lookup (the original recursed into get_match_from_id
            #with an undefined matchid)
            return get_account_id(summoner_name, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get account id failed")
            return r
        response = r.json()
        response["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
        helpers.store_json(response, ".cache/summoners/" + summoner_name + ".json", True)
        return response["accountId"]
    else:
        return dct["accountId"]
def get_match_from_id(matchid, SECRET_API_KEY=None):
    match = helpers.load_json(".match_cache/" + str(matchid) + ".json")
    if not match:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not matchid:
            raise TypeError("Match id cannot be None")
        if not isinstance(matchid, int):
            raise TypeError("Match id must be an int")
        url = BASE + "/lol/match/v3/matches/" + str(matchid)
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print("\n" * 5)
            print(r.json())
            print("\n" * 5)
            try:
                t = r.json()["Retry-After"]
            except (KeyError, ValueError):
                t = 180
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            sys.stdout.flush()
            time.sleep(t)
            return get_match_from_id(matchid, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get match object failed")
            return r
        match = r.json()
        helpers.store_json(match, ".match_cache/" + str(matchid) + ".json", True)
        return match
    else:
        return match
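# A hedged usage sketch for `get_match_from_id`: cached matches come straight from
# `.match_cache/`, otherwise the Riot match-v3 endpoint is queried (with a sleep and
# retry on 429 responses). The match id below is a placeholder.
match = get_match_from_id(1234567890)
if isinstance(match, dict):
    print("game duration (s):", match.get("gameDuration"))  # standard match-v3 field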
def check_resume_success(
    nlp, args, source_file, last_shard, output_path, split, compression
):
    logger.info("Checking if resume was successful...")
    chunk_file_path_str = split + "." + str(last_shard - 1) + ".json"
    if compression:
        chunk_file_path_str += ".gz"
    chunk_file_path = os.path.join(output_path, chunk_file_path_str)

    line_source = source_file.readline().strip()

    line_source_tokenized = next(tokenize(nlp, [line_source]))

    # Apply preprocessing on the line
    preprocessed_line = preprocess(
        line_source_tokenized,
        [1] * len(line_source_tokenized),
        args.min_sentence_ntokens,
        args.max_sentence_ntokens,
        args.min_example_nsents,
        args.max_example_nsents,
    )[0]

    try:
        chunk_json, _ = load_json(chunk_file_path)
    except FileNotFoundError:
        logger.error(
            "The file at path %s was not found. Make sure `--compression` is set correctly.",
            chunk_file_path,
        )
        # without the chunk file there is nothing to compare against
        return False

    last_item_chunk = chunk_json[-1]
    line_chunk = last_item_chunk["src"]

    # remove the last item if it is a newline
    if line_chunk[-1] == ["\n"]:
        line_chunk.pop()

    if line_chunk == preprocessed_line:
        logger.info("Resume Successful!")
        logger.debug("`source_file` moved forward one line")
    else:
        logger.info("Resume NOT Successful")
        logger.info("Last Chunk Line: %s", line_chunk)
        logger.info("Previous (to resume line) Source Line: %s", preprocessed_line)
        # skipcq: PYL-W1201
        logger.info(
            (
                "Common causes of this issue:\n"
                + "1. You changed the `--shard_interval`. You used a different interval previously than you used in the command to resume.\n"
                + "2. The abstractive (`.source` and `.target`) or extractive (`.json`) dataset files were modified or removed. The last `.json` file needs to be in the same folder it was originally outputted to so the last shard index can be determined and the last line can be read.\n"
                + "3. It is entirely possible that there is a bug in this script. If you have checked that the above were not the cause and that there were no issues pertaining to your dataset then open an issue at https://github.com/HHousen/TransformerSum/issues/new."
            )
        )
        return False

    return True
def test_migrate_pids(app, location, datadir):
    """Test migrate pids."""
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    record = CDSRecordDumpLoader.create(dump=dump)
    pids = [pid.pid_value for pid in PersistentIdentifier.query.filter_by(
        object_uuid=record.id)]
    expected = sorted(['2093596', 'CERN-MOVIE-2012-193'])
    assert sorted(pids) == expected
def configure():
    #look for existing methods:
    options = os.listdir(data_path)
    results = {}
    for option in options:
        method_path = os.path.join(data_path, option)
        config = os.path.join(method_path, "config.json")
        if os.path.exists(config):
            result = load_json(config)
            results[option] = result
    return template('configure', options=results)
def _load_from_file(filename):
    try:
        with open(filename, 'r') as f:
            content = helpers.load_json(f.read())
    except (IOError, ValueError, KeyError) as e:
        _logger.warn('Cannot read file %s with json configuration %s', filename, e)
        return False
    r = _load_from_content(content)
    if r:
        _logger.info('Loading %s', filename)
    return r
def subject_new(key):
    """
    create a new subject for the Q-Method specified by key
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        #look for existing subjects:
        options = os.listdir(method_path)
        new_option = ""
        #make sure that:
        #a) we have a new id and
        #b) the new id has not already been used
        while (not new_option) or (new_option in options):
            new_option = generate_id()

        #make new directory in method_path
        subject_path = os.path.join(method_path, new_option)
        if not os.path.exists(subject_path):
            os.makedirs(subject_path)
        else:
            #This should not ever happen with above check, but just in case...
            raise ValueError("Subject path exists, but it shouldn't: %s" % subject_path)

        #make an empty configuration file
        config = os.path.join(subject_path, "subject_config.json")
        result = load_json(config, create=True)
        #once the subject starts sorting, we will cache this locally
        #based on the current state of the method configuration
        #result['statements'] = ""
        result['columns'] = u""
        result['json'] = u""
        result['started'] = u""
        #a textual representation of where each statement is
        result['state'] = u""
        result['history'] = u""
        #is it finished? complete? this will prevent further changes:
        result['locked'] = False
        #now:
        now = datetime.now()
        result['created'] = now.strftime("%Y.%m.%d %H:%M:%S")
        # after first movement
        result['started'] = u""
        result['last_update'] = u""
        save_json(config, result)

        #redirect to the new method's page:
        redirect("/method/" + key + "/")
def main():
    data = helpers.load_json("data/states.json")
    if not isinstance(data, dict):
        data = {x["full_name"]: x for x in data}

    key = "marriage_age"
    new = {}
    lines = helpers.read_lines("entry.txt")
    lines = [x for x in lines if x]
    # lines = lines[::4]
    for line in lines:
        line = line.split(". ")[-1]
        name, num = line.split(": ", 1)
        new[name] = float(num)
        try:
            name = line.split("\t")[0]
            name = name.split("(")[0].strip()
            new[name] = float(line.split("\t")[1].replace(",", ""))
        except Exception:
            pass

    for k, v in new.items():
        print(k, ":", v)

    for name, val in new.items():
        if name not in data:
            data[name] = {}
        data[name][key] = val

    # Clean up the data
    cleaned = {}
    for k, v in data.items():
        key = rmchars(k, ".")
        key = key.replace("Saint", "St")
        if key in cleaned:
            cleaned[key].update(v)
        else:
            cleaned[key] = v
        cleaned[key]["name"] = key

    return helpers.dump_json(cleaned, "foo.json")
def bulk_generation():
    filenames_test = load_json('filenames_test_saifullah')
    path = '../../../../Desktop/UAV/images/'
    num_test_images = len(filenames_test)
    #generated_captions = list()
    f = open('generated_captions_saifullah.txt', 'w')
    for i in range(num_test_images):
        if i == 884:
            #image 884 (square_40) is corrupted, so reuse the previous image
            C = generate_caption(path + filenames_test[883])
            f.writelines(C + '\n')
            continue
        C = generate_caption(path + filenames_test[i])
        #generated_captions.append(C)
        f.writelines(C + '\n')
        progress = 100 * i / num_test_images
        print(i, "Progress: %.2f" % progress)
    f.close()
def post_method_json(key=None):
    #print dir(request.forms)
    #print request.forms.keys()
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        config = os.path.join(method_path, "config.json")
        result = load_json(config)
        changed = False
        #(loop variable named form_key to avoid shadowing the route's `key` parameter)
        for form_key in request.forms.keys():
            #special case for 'statements' key...
            #want to get rid of any extra newline characters
            #this will help calculate the number of statements more accurately
            #(rather than stripping newlines everywhere we look at statements)
            #
            #this works here, but it will make it difficult to provide
            #feedback to the user about how many statements there are
            #compared to how many spaces there are available in columns
            #adding a similar check in method.js
            if form_key == "statements":
                text = request.forms.get(form_key)
                lines = text.splitlines()
                new_lines = []
                for line in lines:
                    if line:
                        new_lines.append(line)
                value = '\n'.join(new_lines)
            else:
                value = request.forms.get(form_key)

            if value != result[form_key]:
                #print "%s (original) != %s (new)" % (result[form_key], request.forms.get(form_key))
                result[form_key] = value
                changed = True

        if changed:
            #print "METHOD CONFIG CHANGED!!!! (saving)"
            save_json(config, result)

        return template('success')
def get_matches_for_tcode(tcode):
    path = ".cache/tournament_matches/" + tcode + ".json"
    #assume a cache miss until the loaded file proves otherwise
    #(the original left both `miss` and `matches` undefined on some paths)
    miss = True
    matches = None
    try:
        matches = helpers.load_json(path)
    except Exception:
        print("Failed to load path")
        print(path)
    if matches is not None:
        file_creation = os.path.getmtime(path)
        expiry_time = 3600 * 24 * 2
        miss = (matches == [] and time.time() > file_creation + expiry_time)
    if miss:
        print("Waiting due to cache miss")
        sys.stdout.flush()
        time.sleep(2)
        endpoint = "/lol/match/v3/matches/by-tournament-code/" + str(tcode) + "/ids"
        api_key = fetch_api_key()
        headers = {"X-Riot-Token": api_key}
        url = M_BASE + endpoint
        r = requests.get(url, headers=headers)
        if r.status_code == 404:
            print("Tcode " + str(tcode) + " does not have any games associated with it.")
            matches = []
            helpers.store_json(matches, path, True)
            return matches
        elif r.status_code == 429:
            print("Hit a retry-after when getting tournament matches. Exiting")
            exit(0)
        elif r.status_code != 200:
            print("Failed to get matches for tcode " + str(tcode))
            print(r)
            return r
        matches = []
        for match_id in r.json():
            match = get_tournament_match(match_id, tcode)
            if match:
                matches.append(match)
        helpers.store_json(matches, path, True)
    return matches
def get_recent_history(accountid, SECRET_API_KEY=None):
    hist = helpers.load_json(".cache/recent_histories/" + str(accountid) + ".json")
    if not hist or time.time() > hist["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not accountid:
            raise TypeError("Account id cannot be None")
        if not isinstance(accountid, int):
            raise TypeError("Account id must be an int")
        url = BASE + "/lol/match/v3/matchlists/by-account/" + str(accountid) + "/recent"
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            #retry this lookup (the original recursed into get_match_from_id
            #with an undefined matchid)
            return get_recent_history(accountid, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get match history failed")
            return r
        response = r.json()
        if "matches" not in response:
            print("There is no recent match history")
            return []
        hist = {
            #expires after 2 hours (the original multiplied instead of adding)
            "expiry_time": time.time() + 60 * 60 * 2,
            "matches": response["matches"]
        }
        helpers.store_json(hist, ".cache/recent_histories/" + str(accountid) + ".json", True)
        return hist["matches"]
    else:
        return hist["matches"]
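# A sketch tying the snippets above together: resolve a summoner name to an account
# id with `get_account_id`, then list the cached recent history. The summoner name is
# a placeholder; on API failure both helpers return the raw response instead.
account_id = get_account_id("ExamplePlayer")
if isinstance(account_id, int):
    for entry in get_recent_history(account_id):
        print(entry.get("gameId"), entry.get("champion"))  # matchlist-v3 fields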
def test_subformat_creation_if_missing(api_app, location, datadir, es, users):
    """Test subformat creation if missing."""
    # [[ migrate the video ]]
    migration_streams = get_migration_streams(datadir=datadir)
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(ExtractFramesTask, '_create_gif'), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'), \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
                side_effect=migration_streams):
        video = CDSRecordDumpLoader.create(dump=dump)
        db.session.commit()

    with mock.patch.object(TranscodeVideoTask, 'run') as mock_transcode:
        deposit = deposit_video_resolver(video['_deposit']['id'])
        deposit_id = deposit.id
        # simulate the missing of a subformat
        del deposit['_files'][0]['subformat'][0]
        assert len(deposit['_files'][0]['subformat']) == 4
        # recreate 240p format
        CDSRecordDumpLoader._create_missing_subformats(
            record=video, deposit=deposit)
        db.session.commit()

        # check subformats
        deposit = Video.get_record(deposit_id)
        rec_video = record_resolver.resolve(video['recid'])[1]
        assert len(deposit['_files'][0]['subformat']) == 5
        assert len(rec_video['_files'][0]['subformat']) == 5

        # check if transcoding is called properly
        assert mock_transcode.called is True
        [(_, call_args)] = mock_transcode.call_args_list
        assert call_args == {'preset_quality': '240p'}
def read_csv(source_csv):
    city_options = City.objects.filter(tag="bloomington_in")
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
##        city = City()
##        city.name = "Bloomington"
##        city.tag = to_tag(city.name)
##        city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-08-29"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}
    if not local_cache.has_key("parcels"):
        local_cache["parcels"] = {}

    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        # reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            # could exit out early here, if needed
            if count > 1000:
                # exit()
                pass

            bldg_id = row[0]
            print bldg_id
            address = row[1]
            print address
            owner = row[2]
            # skip this: (identifier fixed from `ownder_contact`)
            owner_contact = row[3]
            agent = row[4]
            bldg_units = row[9]
            print bldg_units
            units_bdrms = row[10]
            print units_bdrms

            # check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            # make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                # temporarily just want to look at google again
                location.sources = ["google"]

                # do some geocoding, as needed:
                search = "%s, Bloomington IN" % address.upper()

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    if update:
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True
                    location.address_alt = search

                location.bldg_units = bldg_units
                location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source)

                # owner_details = parse_person(owner)
                if owner:
                    result = special_cases(owner)
                    if result:
                        (owner_name, owner_address) = result
                    else:
                        (owner_name, owner_address, owner_phone, remainder) = parse_person(owner)
##                    print "owner name: %s" % owner_name
##                    print "owner address: %s" % owner_address
##                    print ""
                    if owner_name:
                        (person, bldg_person) = make_person(owner_name, bldg, "Owner",
                                                            address=owner_address)

                if agent and agent != "No Agent":
                    # agent_details = parse_person(agent)
                    (agent_name, agent_address, agent_phone, remainder) = parse_person(agent)
##                    print "agent name: %s" % agent_name
##                    print "agent address: %s" % agent_address
##                    print ""
                    if agent_name:
                        (person, bldg_person) = make_person(agent_name, bldg, "Agent",
                                                            address=agent_address, city=city)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = {}
                    for key, value in locations.items():
                        local_cache["buildings"][key] = value.to_dict()
                    save_json(cache_destination, local_cache)
                print

    save_results(locations, "bloomington-filtered.tsv")
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

decoder_model.load_weights('best_models/InceptionV3_5layers/1_checkpoint.keras')

image_dir = '../../Desktop/parsingDataset/RSICD_images/'

inception_tv_train = np.load('image_features/transfer_values/InceptionV3/transfer_values_train.npy')
inception_tv_test = np.load('image_features/transfer_values/InceptionV3/transfer_values_test.npy')

captions_train = load_json('captions_train')


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


chencherry = SmoothingFunction()


def bleu(reference, candidate, grade=1):
    reference_tokenized = word_tokenize(reference)
    reference_list = list()
    reference_list.append(reference_tokenized)
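# The truncated `bleu` helper above builds toward nltk's `sentence_bleu`; a hedged,
# self-contained sketch of the same idea (uniform n-gram weights up to `grade`, with
# chencherry smoothing) might look like this:
def bleu_sketch(reference, candidate, grade=1):
    weights = tuple(1.0 / grade for _ in range(grade))
    return sentence_bleu([word_tokenize(reference)],
                         word_tokenize(candidate),
                         weights=weights,
                         smoothing_function=chencherry.method1)

# example: unigram BLEU between two captions
print(bleu_sketch("a plane parked at the airport",
                  "a plane at the airport", grade=1))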
def read_csv(source):
    #for reading unicode
    #f = codecs.open(source, 'r', encoding='utf-8')

    city_options = City.objects.filter(tag="ann_arbor")
    print len(city_options)
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        city.save()
    else:
        city = city_options[0]

    print city

    #TODO:
    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    #with open('eggs.csv', 'rb') as csvfile:
    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            #could exit out early here, if needed
            if count > 10:
                pass

            print row

            #type of building (eg: sf attached, duplex, etc)
            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
                raise ValueError("Unexpected permit type: %s in row: %s" % (permit_type, row))

            sub_type = row[2]

            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]
            parcel_id = row[4]
            address = row[5]

            #should be fixed per source:
            #(renamed from `city` to avoid clobbering the City object above)
            ss_city = row[6]
            if not ((ss_city.lower() == 'ann arbor') or (ss_city == '')):
                raise ValueError("Unexpected city: %s" % (ss_city))

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]

            if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):
                #check if we've started processing any results for this row
                #if local_cache['buildings'].has_key(address.upper()):
                #    local_cache_cur = local_cache['buildings'][address.upper()]
                #else:
                #    local_cache_cur = {}

                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address.upper()
                for geo_source in location.sources:
                    geo.lookup(search, geo_source, location)

                location.address_alt = search
                locations[address.upper()] = location
                #local_cache['buildings'][address.upper()] = local_cache_cur

                #and check if a previous building object in the db exists
                #CREATE A NEW BUILDING OBJECT HERE
                #cur_building = Building()
                bldg = Building()
                bldg.type = sub_type

                #back it up for later
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            #exit()

            #THE FOLLOWING ARE FOR INFORMATIONAL PURPOSES ONLY
            #(to see what data is available)
            if not status in status_types:
                #print "adding: %s" % sub_type
                status_types.append(status)

            if not sub_type in permit_sub_types:
                #print "adding: %s" % sub_type
                permit_sub_types.append(sub_type)

            building_num = row[8]
            if not building_num in building_nums:
                #print "adding: %s" % sub_type
                building_nums.append(building_num)

            applicant = row[9]
            if (re.search('MGMT', applicant) or
                    re.search('REALTY', applicant) or
                    re.search('PROPERTIES', applicant) or
                    re.search('MANAGEMENT', applicant) or
                    re.search('GROUP', applicant) or
                    re.search('LLC', applicant) or
                    re.search('L.L.C.', applicant) or
                    re.search('INC', applicant)):
                if not applicant in managers:
                    managers.append(applicant)
            else:
                if not applicant in applicants:
                    applicants.append(applicant)

            #print ', '.join(row)
            #print

##    print permit_sub_types
    print status_types
    print building_nums

    save_results(locations)
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
##        city = City()
##        city.name = city_name
##        city.tag = to_tag(city.name)
##        city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            #could exit out early here, if needed
            if count > 1000:
                #exit()
                pass

            address = row[0]
            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            parts = parts[:-1]
            street = ",".join(parts)
            address = "%s %s" % (anumber, street)

            invoice_number = row[1]
            bldg_id = row[1]
            print bldg_id

            #this is where owner is stored
            invoice_note = row[6]
            print invoice_note
            if re.match('Sent to:', invoice_note):
                print "changing invoice note from: %s" % invoice_note
                invoice_note = invoice_note[8:]
                print "to: %s" % invoice_note
            else:
                #raise ValueError, "invoice note does not start with Sent to"
                print "!!!!!invoice note does not start with Sent to!!!!!"
                print ""
                print ""

            no_units = row[12]

##            #should always be "RENTAL" (don't need to track this one)
##            permit_type = row[1]
##            if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
##                raise ValueError, "Unexpected permit type: %s in row: %s" % (
##                    permit_type, row)
##            bldg_type = row[2]
##            #can use this to filter out non-rental or obsolete entries
##            #don't need to track otherwise:
##            status = row[3]
##            parcel_id = row[4]
##            #should be fixed per source:
##            ss_city = row[6]
##            bldg_sf = row[7]
##            no_bldgs = row[8]
##            applicant_name = row[9]
##            no_stories = row[10]
##            no_units = row[11]
##            if not ( (ss_city.lower() == city_name.lower()) or (ss_city == '') ):
##                raise ValueError, "Unexpected city: %s" % (ss_city)
##            sqft = row[7]
##            number_of_buildings = row[8]
##            applicant_name = row[9]
##            number_of_stories = row[10]
##            number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

##            if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #temporarily just want to look at google again
                #location.sources = ["google"]
                #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
                location.sources = ["google", "bing"]

                #do some geocoding, as needed:
                search = "%s, %s, %s" % (address.upper(), city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    #update = geo.lookup(search, geo_source, location, force=False)
                    if update:
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                #this is the case for brand new searches
                #(which are updated in a different sense)
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True
                    location.address_alt = search

                #location.bldg_units = bldg_units
                #location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                #handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

                if invoice_note:
                    (person, bldg_person) = make_person(invoice_note, bldg, "Permit Applicant")

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in locations.items():
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)
                print

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def input_data(self):
    if not self._input_data:
        self._input_data = load_json(self._inputfile)
    return self._input_data
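# `input_data` is a lazy-load accessor: parse the JSON input file once, then serve
# the cached value. A minimal self-contained sketch of the same pattern, assuming a
# `load_json(path)` helper that returns a dict:
class InputHolder(object):
    def __init__(self, inputfile):
        self._inputfile = inputfile
        self._input_data = None

    @property
    def input_data(self):
        if not self._input_data:
            self._input_data = load_json(self._inputfile)
        return self._input_data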
def create_table_command(json_file='table_columns.json', table_name='congress_bills'):
    col_data = load_json(json_file)
    ct = CreateTable(table_name, col_data)
    return ct.parse_to_create_table_command()
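# A hedged sketch of the input `create_table_command` expects: `table_columns.json`
# presumably maps column names to SQL types (this exact shape is an assumption), and
# the returned string would be handed to a database cursor.
#
# table_columns.json (assumed shape):
#   {"bill_id": "VARCHAR(32)", "title": "TEXT", "introduced_date": "DATE"}
sql = create_table_command('table_columns.json', table_name='congress_bills')
print(sql)  # e.g. CREATE TABLE congress_bills (...)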
def read_csv(source_csv, city_tag, feed_date):
    #could also use city.models.find_by_city_state
    city_options = City.objects.filter(tag=city_tag)
    #print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    else:
        city = city_options[0]

    print city

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    # ideally, should be able to use the database itself as the cache,
    # instead of using a local file
    # but it's also good to not have to repeat geo queries if going in bulk
    # the site code *will* make geo queries
    # so it's still a good idea to cache the coded address locally
    # even if using the site code for everything else.
    cache_file = "%s.json" % city.tag
    #print cache_file
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    print cache_destination
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    loaded_cache = load_json(cache_destination, create=True)

    #need to go through and load SearchResults separately
    local_cache = {}
    for key in loaded_cache.keys():
        #this is useful if there is a cached value
        #that was not parsed correctly... this will remove it:
        #if key.strip() == "314 North Washington Street Apt. C":
        if key.strip() == "some address with bad cached data":
            print "not adding: ", key
            #exit()
            pass
        else:
            current = loaded_cache[key]
            results = current['results']
            #print results
            sr = SearchResults()
            #sr.from_dict(results, debug=True)
            sr.from_dict(results, debug=False)
            #print sr
            current['results'] = sr
            #print current['results']
            local_cache[key] = current

    #use street address as the key
    #for each address, store SearchResults object

    #reset skips for every run:
    skips = codecs.open("skips.txt", 'w', encoding='utf-8')
    skips.close()
    skips = 0

    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())
        print

        keys = []
        for item in reader.next():
            key = item.lower().strip()
            key = key.replace('(', '')
            key = key.replace(')', '')
            key = key.replace('-', '_')
            key = key.replace('.', '')
            key = key.replace('/ ', '')
            key = key.replace('/', '_')
            key = key.replace('"', '')
            key = key.replace('#', 'num')
            key = key.replace(' ', '_')
            keys.append(key)

        #*and* the second row in this case
        print '>, <'.join(keys)

        #currently:
        #<street_address>, <unit_if_applicable>, <unit_type>, <rent>, <security_deposit>, <sq_feet_per_unit>, <num_bedrooms>, <num_bathrooms>, <maximum_occupancy_per_unit>, <lease_period>, <availability>, <laundry>, <parking>, <air_conditioning>, <pets>, <gym_fitness_center>, <game_room_rec_center_community_center>, <pool>, <other_amenities>, <bike_friendly>, <recycling>, <composting>, <gardening>, <public_transit>, <walk_friendly>, <other_smartliving_features>, <who_pays_for_electricity>, <who_pays_for_natural_gas>, <who_pays_for_water>, <who_pays_for_trash_recycling_pickup>, <who_pays_for_telephone_land_line>, <who_pays_for_cable>, <who_pays_for_internet>, <electricity_provider>, <electric_utility_cost_average_per_mo>, <electric_utility_cost_low>, <electric_utility_cost_high>, <natural_gas_provider>, <natural_gas_utility_cost_average_per_mo>, <natural_gas_utility_cost_low>, <natural_gas_utility_cost_high>, <energy_saving_features>, <utility_info_source>, <agent_property_manager>, <property_website_url>, <agent_property_manager_address>, <agent_property_manager_phone>, <owner>, <comments>

        #exit()

        count = 0
        #start = 6439
        start = 0

        #if you want to randomize the order... to distribute options more evenly
        #just do this in the original spreadsheet.
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            current = {}

            count += 1
            print "Looking at row: %s" % count
            #could exit out early here, if needed (for testing)
            if count > 7220:
                #all_done(cache_destination, local_cache)
                pass

            if count >= start:
                address = process_row(current, row, keys, local_cache, city, feed_source, count)
                print
                local_cache[address] = current

                #save every time...
                #never know when a crash will happen:
                #however, this does make things run considerably slower
                #especially once the cached file size grows.
                #save_results(cache_destination, local_cache)

        #exit()

    all_done(cache_destination, local_cache)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "rentrocket.settings")

## from rentrocket import settings
## from django.core.management import setup_environ
## setup_environ(settings)

from city.models import City, to_tag
from helpers import save_json, load_json, Location, Geo, save_results, make_building

cache_file = "cities.json"
cache_destination = os.path.join(os.path.dirname(__file__), cache_file)
#keep a local copy of data we've processed...
#this should help with subsequent calls
#to make sure we don't need to duplicate calls to remote geolocation APIs:
saved_cities = load_json(cache_destination, create=True)

#geocoder helper:
geo = Geo()

cities = [
    ['Bloomington', 'IN', '', ''],
    ['Ann Arbor', 'MI', '', ''],
    ['Albany', 'NY', '', ''],
    ['Iowa City', 'IA', '', ''],
    ['Burlington', 'VT', '', ''],
    ['Austin', 'TX', '', ''],
    ['Columbia', 'MO', '', ''],
    ['Madison', 'WI', '', ''],
    ['Lawrence', 'KS', '', ''],
    ['Berkeley', 'CA', '', ''],
def load(self, str_data):
    try:
        self.data = helpers.load_json(str_data)
    except ValueError:
        return False
    return True
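# Usage sketch for `load`: it swallows JSON decode errors and reports success as a
# boolean, so callers branch on the return value (`cfg` is a hypothetical instance
# of the owning class):
if not cfg.load('{"interval": 30, "enabled": true}'):
    print("could not parse configuration string")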
from helpers import load_json
from helpers import print_progress
from copy import copy
import json
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
#from tensorflow.python.keras.models import Model

transfer_values_train = np.load('image_features/transfer_values/InceptionV3/transfer_values_train.npy')
transfer_values_test = np.load('image_features/transfer_values/InceptionV3/transfer_values_test.npy')

captions_train = load_json('captions_train')

# LOAD THE CANDIDATE CAPTIONS
BS_filename = 'InceptionCaptions/9_beamsearched.json'
with open(BS_filename, 'r') as f:
    candidate_captions = json.load(f)

# Load the transfer model
# After first execution, you can comment these lines
#from tensorflow.python.keras.applications import VGG16
#image_model = VGG16(include_top=True, weights='imagenet')
#transfer_layer = image_model.get_layer('fc2')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)

image_dir = 'UAV/images/'
filenames_test = load_json('filenames_test')
def test_migrate_record(frames_required, api_app, location, datadir, es, users):
    """Test record migration."""
    # [[ migrate the project ]]
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id
    assert project['$schema'] == Project.get_record_schema()
    assert project['publication_date'] == '2016-01-05'
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        },
        'modified_by': users[0],
    }
    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
        '_deposit']['pid']['revision_id']
    assert deposit_project['_deposit']['created_by'] == 1
    assert deposit_project['_deposit']['owners'] == [1]
    assert deposit_project['_files'] == []
    # [[ migrate the video ]]
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])
    db.session.commit()

    def check_symlinks(video):
        symlinks_creator = SymlinksCreator()
        files = list(symlinks_creator._get_list_files(record=video))
        assert len(files) == 1
        for file_ in files:
            path = symlinks_creator._build_link_path(
                symlinks_creator._symlinks_location, video, file_['key'])
            assert os.path.lexists(path)

    def check_gif(video, mock_gif):
        # called only once for deposit
        (_, _, mock_args) = mock_gif.mock_calls[0]
        # check gif record
        video = CDSRecord(dict(video), video.model)
        # check gif deposit
        deposit = deposit_video_resolver(video['_deposit']['id'])
        master_video = CDSVideosFilesIterator.get_master_video_file(deposit)
        assert mock_args['master_id'] == master_video['version_id']
        assert str(deposit.files.bucket.id) == mock_args['bucket']
        # assert mock_args['bucket'].id == deposit.files.bucket.id
        assert len(mock_args['frames']) == 10
        assert 'output_dir' in mock_args

    migration_streams = get_migration_streams(datadir=datadir)
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(CDSRecordDumpLoader, '_get_minimum_frames',
                              return_value=frames_required) as mock_frames, \
            mock.patch.object(
                ExtractFramesTask, '_create_gif') as mock_gif, \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
                side_effect=migration_streams), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'):
        video = CDSRecordDumpLoader.create(dump=dump)
        assert mock_frames.called is True
    db.session.add(video.model)
    video_id = video.id
    # check smil file
    smil_obj = ObjectVersion.query.filter_by(
        key='CERN-MOVIE-2012-193-001.smil', is_head=True).one()
    storage = smil_obj.file.storage()
    assert '<video src' in storage.open().read().decode('utf-8')
    # check video symlinks
    check_symlinks(video)
    # check gif
    check_gif(video, mock_gif)
    # check project
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    ]
    assert video['$schema'] == Video.get_record_schema()
    assert video['date'] == '2012-11-21'  # metadata date
    assert video['publication_date'] == '2017-07-13'  # creation date (DB)
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    }]
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    }
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    }
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            assert 'checksum' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert FileInstance.query.filter_by(
                id=file_['file_id']) is not None
            assert 'key' in file_
            assert 'links' in file_
            assert 'media_type' in file_
            assert 'tags' in file_
        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any([key in master_video['tags']
                    for key in ExtractMetadataTask._all_keys])
        assert any([key in video['_cds']['extracted_metadata']
                    for key in ExtractMetadataTask._all_keys])

    def check_buckets(record, deposit):
        def get(key, record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all([file_['tags']['master'] == master['version_id']
                        for file_ in files])

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check where the tag 'master' is pointing
        check_tag_master(record)
        check_tag_master(deposit)

    def check_first_level_files(record):
        [master] = [file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO: assert len(master['playlist']) == ??
        assert len([file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']) == 1
        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            assert float(frame['tags']['timestamp']) < duration
            assert float(frame['tags']['timestamp']) > 0
        # check tag 'preset_quality'
        pqs = [form['tags']['preset_quality'] for form in master['subformat']]
        assert sorted(pqs) == sorted(['1080p', '240p', '360p', '480p', '720p'])
        # check tag 'display_aspect_ratio'
        dar = set([form['tags']['display_aspect_ratio']
                   for form in master['subformat']])
        assert dar == {'16:9'}

    def check_pids(record):
        """Check pids."""
        assert record['report_number'][0] == 'CERN-VIDEO-2012-193-001'
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-VIDEO-2012-193-001').count() == 1
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-MOVIE-2012-193-001').count() == 1

    db.session.commit()
    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    ).one().object_uuid
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    video = Record.get_record(video_id)
    assert video.revision_id == deposit_video[
        '_deposit']['pid']['revision_id']
    assert deposit_video['_deposit']['created_by'] == users[0]
    assert deposit_video['_deposit']['owners'] == [users[0]]
    assert deposit_video['_project_id'] == '2093596'
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_files(video)
    check_files(deposit_video)
    check_buckets(video, deposit_video)
    check_first_level_files(video)
    check_first_level_files(deposit_video)
    check_pids(video)
    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()
    # try to edit project
    deposit_project = deposit_project_resolver(
        deposit_project['_deposit']['id'])
    deposit_project = deposit_project.edit()
    login_user(User.query.filter_by(id=users[0]).first())
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
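# The migration test above calls a dump_object() helper that is not shown in
# this excerpt. A minimal sketch of what it is assumed to do -- serialise an
# ObjectVersion into a plain dict -- with field names inferred from the
# assertions above; the real project helper may differ:
def dump_object(obj):
    """Serialise an ObjectVersion into a plain dict (assumed behaviour)."""
    return {
        'bucket_id': str(obj.bucket_id),
        'checksum': obj.file.checksum,
        'file_id': str(obj.file_id),
        'key': obj.key,
        'version_id': str(obj.version_id),
        'tags': obj.get_tags(),
    }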
# This code computes the BLEU score for the candidate sentences
import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json
from helpers import load_json
#from NN_architecture import generate_caption

# run the NN architecture before
captions_test = load_json('captions_test')

# earlier runs read the generated captions from a text file:
##generate_caption(path+filenames_test[0])
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions = inFile.readlines()
#for i in range(len(generated_test_captions)):
#    ## THIS LINE REMOVES THE FIRST EMPTY SPACE
#    ## generated_test_captions[i] = generated_test_captions[i][1:]
#    generated_test_captions[i] = generated_test_captions[i].replace('\n', '')

# load from json
with open('captions_vgg16/12_generated_captions_VGG16.json') as inFile:
    generated_test_captions = json.load(inFile)

# duplicate entry 883 (presumably to realign the generated captions
# with the reference captions)
c_to_insert = generated_test_captions[883]
generated_test_captions.insert(884, c_to_insert)
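# A minimal sketch of the scoring step this snippet builds towards, assuming
# captions_test is a list of reference-caption lists aligned index-by-index
# with generated_test_captions (both names come from the snippet above):
smooth = SmoothingFunction().method1
references = [[word_tokenize(ref) for ref in refs] for refs in captions_test]
hypotheses = [word_tokenize(cap) for cap in generated_test_captions]
print('Corpus BLEU: %.4f' %
      corpus_bleu(references, hypotheses, smoothing_function=smooth))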
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            print row

            address = row[0]
            ## no_units = row[12]

            #can pass this in as bldg_id to make_building
            #that gets used for parcel too
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            #skip row9 (in/out... whatever that means)
            zip_code = row[10]
            #skip row11, assessor id
            #skip row12, address num
            #skip row13, x
            #skip row14, y
            #xcoord == lng
            lng = row[15]
            lat = row[16]
            #entry floor number: (named 'z' in sheet)
            floor = row[17]
            #skip row18, strcid... not sure
            #skip row19, parent
            #skip row20, app_
            #skip row21, hteloc
            zone = row[22]
            bldg_type = row[23]
            #number of buildings
            bldg_num = row[24]
            no_units = row[25]
            #skip row[26], inspection type
            #skip row27, app number
            #skip row28, date received
            #skip row29, application type
            #skip row30, ownerid
            #skip row31, operator id
            #skip row32, agent_id
            #skip row33, mail to
            central_heat = row[34]
            if central_heat == 'Y':
                central_heat = True
            else:
                central_heat = False

            #heat mechanism? heat mechanic??? not sure
            heat_mech = row[35]
            #skip row36, agent id (2)
            #skip row37, agent last name
            #skip row38 agent first name
            #skip row39 agent middle initial
            #skip row40, agent title
            #skip row41, business name

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])
            address_main = " ".join([street_num, street_dir, street_name,
                                     street_sfx, qualifier_pre])
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print address

            owner_address = ", ".join([owner_address1, owner_address2,
                                       owner_city, owner_state, owner_zip])

            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)

            ## bldg_type = row[2]
            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]
            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #temporarily just want to look at google again
                #location.sources = ["google"]
                #location.sources = ["google", "bing"]
                #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]

                #skip geocoding for columbia
                location.sources = []

                #do some geocoding, as needed:
                search = "%s, %s, %s" % (address.upper(), city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    #update = geo.lookup(search, geo_source, location, force=False)
                    if update:
                        any_updated = True

                location.sources = ['csv', "google", "bing", "usgeo",
                                    "geonames", "openmq", "mq"]

                #manually add data from csv here:
                result = []
                result.append({'place': address, 'lat': lat, 'lng': lng})
                setattr(location, 'csv', result)

                #this is the case for brand new searches
                #(which are updated in a different sense)
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True
                    location.address_alt = search

                #location.bldg_units = bldg_units
                #location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                #handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source,
                                     no_units=no_units, bldg_type=bldg_type)

                if apt_main:
                    unit = make_unit(apt_main, bldg)

                (person, bldg_person) = make_person(owner_name, bldg, "Agent",
                                                    address=owner_address)

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in locations.items():
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

            print
            #exit()

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
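# read_csv() above (and most snippets in this collection) depends on two small
# JSON-cache helpers, load_json and save_json, that are defined elsewhere.
# A minimal sketch of the assumed behaviour (create=True returns an empty
# dict when the cache file does not exist yet); the real helpers may differ:
import json
import os


def load_json(path, create=False):
    """Load JSON from path; with create=True, a missing file yields {}."""
    if not os.path.exists(path):
        return {} if create else None
    with open(path) as json_file:
        return json.load(json_file)


def save_json(path, data):
    """Write data to path as JSON."""
    with open(path, 'w') as json_file:
        json.dump(data, json_file)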
# LOAD THE TRANSFER MODEL
# this only needs to be done at the first execution
#
#from tensorflow.python.keras.applications import VGG16
#from tensorflow.python.keras.models import Model
#image_model = VGG16(include_top=True, weights='imagenet')
#transfer_layer = image_model.get_layer('fc2')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)

transfer_values_train = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_train.npy')
transfer_values_test = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_test.npy')

captions_train = load_json('captions_train')

filename = 'InceptionCaptions/5_beamsearched.json'
out_dir = 'best_beamsearched/InceptionV3/'
with open(filename, 'r') as inFile:
    beamCaptions = json.load(inFile)
beamCaptions = tuple(beamCaptions)


def get_transfer_values(image_path):
    tv_len = transfer_values_test[0].shape[0]
    filename = image_path[len(image_dir):]
    for i in range(len(filenames_test)):
        if filenames_test[i] == filename:
            break
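# The truncated function above stops at the index lookup. A hedged guess at
# the intended completion: return the pre-computed transfer-values row for
# the matched test image (image_dir, filenames_test and transfer_values_test
# are globals from the original script):
def get_transfer_values_sketch(image_path):
    filename = image_path[len(image_dir):]
    for i, name in enumerate(filenames_test):
        if name == filename:
            return transfer_values_test[i]
    raise KeyError("image not found in test set: %s" % filename)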
def test_migrate_record(app, location, datadir, es):
    """Test record migration."""
    # create the project
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id
    date = '2015-11-13'
    assert project['$schema'] == Project.get_record_schema()
    assert project['date'] == date
    assert project['publication_date'] == date
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        },
        'modified_by': None,
    }
    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
        '_deposit']['pid']['revision_id']
    assert deposit_project['_deposit']['created_by'] == -1
    assert deposit_project['_deposit']['owners'] == [-1]
    assert deposit_project['_files'] == []
    # create the video
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])

    def load_video(*args, **kwargs):
        return open(join(datadir, 'test.mp4'), 'rb')

    with mock.patch.object(DataCiteProvider, 'register') as mock_datacite, \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream',
                return_value=load_video()):
        video = CDSRecordDumpLoader.create(dump=dump)
    # assert mock_datacite.called is True
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    ]
    assert video['$schema'] == Video.get_record_schema()
    date = '2012-11-20'
    assert video['date'] == date
    assert video['publication_date'] == date
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    }]
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    }
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    }
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            assert 'checksum' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert FileInstance.query.filter_by(
                id=file_['file_id']) is not None
            assert 'key' in file_
            assert 'links' in file_
            assert 'media_type' in file_
            assert 'tags' in file_
        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any([key in master_video['tags']
                    for key in ExtractMetadataTask._all_keys])
        assert any([key in video['_cds']['extracted_metadata']
                    for key in ExtractMetadataTask._all_keys])

    def check_buckets(record, deposit):
        def get(key, record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all([file_['tags']['master'] == master['version_id']
                        for file_ in files])

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check where the tag 'master' is pointing
        check_tag_master(record)
        check_tag_master(deposit)

    def check_first_level_files(record):
        [master] = [file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO: assert len(master['playlist']) == ??
        assert len([file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']) == 1
        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            assert float(frame['tags']['timestamp']) < duration
            assert float(frame['tags']['timestamp']) > 0

    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    ).one().object_uuid
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    assert video.revision_id == deposit_video[
        '_deposit']['pid']['revision_id']
    assert deposit_video['_deposit']['created_by'] == -1
    assert deposit_video['_deposit']['owners'] == [-1]
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_files(video)
    check_files(deposit_video)
    check_buckets(video, deposit_video)
    check_first_level_files(video)
    check_first_level_files(deposit_video)
    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()
    # try to edit project
    deposit_project = deposit_project_resolver(
        deposit_project['_deposit']['id'])
    deposit_project = deposit_project.edit()
    # try to publish the video again
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        # search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        # print
        # print sr
        # print
        search_results[key] = sr

    # geocoder helper:
    # geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0

        # want to randomize the order... distribute options more evenly
        # print len(reader)
        # exit()
        # in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            any_updated = False

            # could exit out early here, if needed
            if count > 10:
                # exit()
                pass

            # if you want to skip ahead more quickly:
            if count < 27187:
                pass
            else:
                # print row

                objectid = row[0]
                ## no_units = row[12]

                # can pass this in as bldg_id to make_building
                # that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                # eg building number
                qualifier_pre = row[6]
                # eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                # skip row9 (in/out... whatever that means)
                zip_code = row[10]
                # skip row11, assessor id
                # skip row12, address num
                # skip row13, x
                # skip row14, y
                # xcoord == lng
                lng = row[15]
                lat = row[16]
                # entry floor number: (named 'z' in sheet)
                floor = row[17]
                # skip row18, strcid... not sure
                # skip row19, parent
                # skip row20, app_
                # skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                # number of buildings
                bldg_num = row[24]
                no_units = row[25]
                # skip row[26], inspection type
                # skip row27, app number
                # skip row28, date received
                # skip row29, application type
                # skip row30, ownerid
                # skip row31, operator id
                # skip row32, agent_id
                # skip row33, mail to
                central_heat = row[34]
                if central_heat == "Y":
                    central_heat = True
                else:
                    central_heat = False

                # heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                # skip row36, agent id (2)
                # skip row37, agent last name
                # skip row38 agent first name
                # skip row39 agent middle initial
                # skip row40, agent title
                # skip row41, business name

                # could be owner, could be agent
                owner_name = row[42]
                owner_address1 = row[43]
                owner_address2 = row[44]
                owner_city = row[45]
                owner_state = row[46]
                owner_zip = row[47]

                # address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                # this is causing problems with lookups in google
                if (qualifier_pre == "DUP" or
                        qualifier_pre == "DUPE" or
                        qualifier_pre == "2-Jan" or
                        qualifier_pre == "HM" or
                        qualifier_pre == "DWN"):
                    qualifier_pre = ""

                address_main = " ".join([street_num, street_dir, street_name,
                                         street_sfx, qualifier_pre])
                address_main = address_main.strip()
                # get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                # similar to conversions,
                # but there are too many of these to list there
                if re.search("HOLLY RIDGE LN", address_main):
                    address_main = address_main.replace("HOLLY RIDGE LN",
                                                        "HOLLYRIDGE LN")
                if re.search("BERKSHIRE CT", address_main):
                    address_main = address_main.replace("BERKSHIRE CT",
                                                        "BERKSHIRE")
                    # address_main = ''
                if re.search("CAMERON CT", address_main):
                    address_main = address_main.replace("CAMERON CT", "CAMERON")
                    # address_main = ''
                if re.search("ATHENS CT", address_main):
                    address_main = address_main.replace("ATHENS CT", "ATHENS")
                    # address_main = ''
                if re.search("LAMAR CT", address_main):
                    address_main = address_main.replace("LAMAR CT", "LAMAR")
                    # address_main = ''
                if re.search("MONITEAU CT", address_main):
                    address_main = address_main.replace("MONITEAU CT", "MONITEAU")
                    # address_main = ''
                if re.search("IMPERIAL CT", address_main):
                    address_main = ""
                if re.search("PERKINS DR", address_main):
                    address_main = ""
                if re.search("GRANITE OAKS CT", address_main):
                    address_main = ""

                # sometimes the 'BLDG' data is added in the wrong place
                # then it gets treated as a unit item
                # (but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    # the original joined in apt_main here, which is not yet
                    # defined at this point; apt_num must be what was meant
                    address_main = " ".join([address_main, qualifier_post,
                                             apt_num])
                    address_main = address_main.strip()
                    apt_main = ""
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                # check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])
                else:
                    address = ""

                owner_address = ", ".join([owner_address1, owner_address2,
                                           owner_city, owner_state, owner_zip])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                # make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1
                    skipf = codecs.open("skips.txt", "a", encoding="utf-8")
                    original = " ".join([street_num, street_dir, street_name,
                                         street_sfx, qualifier_pre])
                    skipf.write(original)
                    skipf.write("\n")
                    skipf.close()
                else:
                    # check if we've started processing any results for this row
                    if search_results.has_key(address.upper()):
                        print "Already had building: %s" % address
                        results = search_results[address.upper()]
                        # print results
                    else:
                        addy = ", ".join([address_main, city.name, city.state])
                        addy += " " + zip_code
                        # addy += ", USA"
                        print addy

                        # toggle between an actual google query
                        results = address_search(addy, apt_main)

                        # print dir(results)

                        if len(results.matches) > 1:
                            print results
                            for option in results.matches:
                                print "%s: %s, %s" % (option["place"],
                                                      option["lat"],
                                                      option["lng"])
                            print
                            print "Source Lat: %s, Lng: %s" % (lat, lng)
                            src_lat = int(float(lat) * 100)
                            src_lng = int(float(lng) * 100)

                            matched = False
                            for current in results.matches:
                                # current = results.matches[0]
                                print current["lat"]
                                print current["lng"]
                                # only want to look at the first 2 decimal places:
                                comp_lat = int(float(current["lat"]) * 100)
                                comp_lng = int(float(current["lng"]) * 100)
                                print comp_lat
                                print comp_lng
                                if (src_lat == comp_lat) and (src_lng == comp_lng):
                                    # results.matches = results.matches[:1]
                                    results.matches = [current]
                                    matched = True

                            if not matched:
                                print "DIDN'T MATCH!"
                                exit()

                        any_updated = True

                        # or just use the results as specified in the csv
                        # (THIS DOES NOT NORMALIZE THE ADDRESS VIA GOOGLE)
                        # results = SearchResults()
                        # results.unit_text = apt_main
                        # handle_place(results, addy, lat, lng, apt_main)

                    assert results
                    # print results

                    lookup_building_with_geo(results, make=True,
                                             parcel_id=parcel_id)
                    # print results
                    # current['results'] = results

                    # print results
                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:
                        search_results[address.upper()] = results

                    bldg = results.building
                    assert bldg
                    unit = results.unit

                    # may be a case where the unit is blank
                    # and another unit with a number/letter was created earlier
                    # in that case, we won't be creating one here
                    # and the building will already exist...
                    # not necessarily an error though
                    # just redundant data
                    # assert unit

                    (person, bldg_person) = make_person(owner_name, bldg,
                                                        "Agent",
                                                        address=owner_address)

                # time.sleep(1)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = {}
                    for key, value in search_results.items():
                        # search_results[key] = SearchResults().from_dict(value)
                        local_cache["buildings"][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                print
    loss_mean = tf.reduce_mean(loss)
    return loss_mean


# This code is used to generate captions;
# the CNN model, as well as the image size, has to be specified
image_model = VGG16(include_top=True, weights='imagenet')
transfer_layer = image_model.get_layer('fc2')
image_model_transfer = Model(inputs=image_model.input,
                             outputs=transfer_layer.output)
img_size = K.int_shape(image_model.input)[1:3]

# recreate the tokenizer
mark_start = 'ssss '
mark_end = ' eeee'
captions_train = load_json('captions_train_saifullah')
captions_train_marked = mark_captions(captions_train)
captions_train_flat = flatten(captions_train_marked)
tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=167)
token_start = tokenizer.word_index[mark_start.strip()]
token_end = tokenizer.word_index[mark_end.strip()]
tokens_train = tokenizer.captions_to_tokens(captions_train_marked)


def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).
    """
    # Load and resize the image.
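# mark_captions and flatten are called above but defined elsewhere; minimal
# sketches consistent with how they are used (wrap every caption with the
# start/end marker strings, then flatten the list-of-lists for the tokenizer):
def mark_captions(captions_listlist):
    return [[mark_start + caption + mark_end for caption in captions_list]
            for captions_list in captions_listlist]


def flatten(captions_listlist):
    return [caption
            for captions_list in captions_listlist
            for caption in captions_list]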
def read_csv(source_csv, city_name, city_tag, driver):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    position_file = "position.json"
    position = load_json(position_file, create=True)
    if not position:
        position = 0

    cache_file = "%s-20150525.json.bkup" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}

    search_results = {}
    for key, value in local_cache['buildings'].items():
        #search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        #print
        #print sr
        #print
        search_results[key] = sr

    #geocoder helper:
    #geo = Geo()

    provider = ''
    provider_options = ServiceProvider.objects.filter(name='City of Columbia')
    if len(provider_options):
        provider = provider_options[0]
    else:
        raise ValueError, "error finding utility_provider: %s matches" % len(provider_options)

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s, position: %s" % (count, position)
            start = datetime.now()
            print "Started: ", start
            any_updated = False

            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            #if you want to skip ahead more quickly:
            #if count < 0:
            if count < position:
                pass
            else:
                #print row

                objectid = row[0]
                ## no_units = row[12]

                #can pass this in as bldg_id to make_building
                #that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                #eg building number
                qualifier_pre = row[6]
                #eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                #skip row9 (in/out... whatever that means)
                zip_code = row[10]
                #skip row11, assessor id
                #skip row12, address num
                #skip row13, x
                #skip row14, y
                #xcoord == lng
                lng = row[15]
                lat = row[16]
                #entry floor number: (named 'z' in sheet)
                floor = row[17]
                #skip row18, strcid... not sure
                #skip row19, parent
                #skip row20, app_
                #skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                #number of buildings
                bldg_num = row[24]
                no_units = row[25]
                #skip row[26], inspection type
                #skip row27, app number
                #skip row28, date received
                #skip row29, application type
                #skip row30, ownerid
                #skip row31, operator id
                #skip row32, agent_id
                #skip row33, mail to
                central_heat = row[34]
                if central_heat == 'Y':
                    central_heat = True
                else:
                    central_heat = False

                #heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                #skip row36, agent id (2)
                #skip row37, agent last name
                #skip row38 agent first name
                #skip row39 agent middle initial
                #skip row40, agent title
                #skip row41, business name

                #could be owner, could be agent
                ## owner_name = row[42]
                ## owner_address1 = row[43]
                ## owner_address2 = row[44]
                ## owner_city = row[45]
                ## owner_state = row[46]
                ## owner_zip = row[47]

                #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                #this is causing problems with lookups in google
                if (qualifier_pre == "DUP" or
                        qualifier_pre == "DUPE" or
                        qualifier_pre == "2-Jan" or
                        qualifier_pre == "HM" or
                        qualifier_pre == "DWN"):
                    qualifier_pre = ''

                address_main = " ".join([street_num, street_dir, street_name,
                                         street_sfx, qualifier_pre])
                address_main = address_main.strip()
                #get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                #similar to conversions,
                #but there are too many of these to list there
                if re.search('HOLLY RIDGE LN', address_main):
                    address_main = address_main.replace('HOLLY RIDGE LN',
                                                        'HOLLYRIDGE LN')
                if re.search('BERKSHIRE CT', address_main):
                    address_main = address_main.replace('BERKSHIRE CT',
                                                        'BERKSHIRE')
                    #address_main = ''
                if re.search('CAMERON CT', address_main):
                    address_main = address_main.replace('CAMERON CT', 'CAMERON')
                    #address_main = ''
                if re.search('ATHENS CT', address_main):
                    address_main = address_main.replace('ATHENS CT', 'ATHENS')
                    #address_main = ''
                if re.search('LAMAR CT', address_main):
                    address_main = address_main.replace('LAMAR CT', 'LAMAR')
                    #address_main = ''
                if re.search('MONITEAU CT', address_main):
                    address_main = address_main.replace('MONITEAU CT', 'MONITEAU')
                    #address_main = ''
                if re.search('IMPERIAL CT', address_main):
                    address_main = ''
                if re.search('PERKINS DR', address_main):
                    address_main = ''
                if re.search('GRANITE OAKS CT', address_main):
                    address_main = ''

                #sometimes the 'BLDG' data is added in the wrong place
                #then it gets treated as a unit item
                #(but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    #the original joined in apt_main here, which is not yet
                    #defined at this point; apt_num must be what was meant
                    address_main = " ".join([address_main, qualifier_post,
                                             apt_num])
                    address_main = address_main.strip()
                    apt_main = ''
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                #check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                #make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1
                    ## skips = codecs.open("skips.txt", 'a', encoding='utf-8')
                    ## original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    ## skips.write(original)
                    ## skips.write('\n')
                    ## skips.close()

                #check if we've started processing any results for this row
                elif not search_results.has_key(address.upper()):
                    print "No saved search results for address: %s" % address
                    print "Skipping."
                    print
                    #raise ValueError, "No results found for %s" % address
                else:
                    print "Already had building: %s" % address
                    results = search_results[address.upper()]

                    assert results
                    #print results

                    lookup_building_with_geo(results, make=True,
                                             parcel_id=parcel_id)
                    #print results
                    #current['results'] = results

                    #print results
                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:
                        bldg = results.building
                        assert bldg
                        unit = results.unit
                        #at this point there should be at least one unit
                        #and we will want to associate results with that unit
                        #assert unit
                        #can just pass this up in this case
                        if not unit:
                            print "Skipping address... no matching Unit!"
                        else:
                            #now that we have a building,
                            #look up energy data on the remote website

                            #result = urllib2.urlopen("http://example.com/foo/bar")
                            #print result.read()

                            ## base = "http://www.gocolumbiamo.com/cfforms/ub/rental.html"
                            ## driver.get(base)
                            ## search = driver.find_element_by_css_selector('#address')
                            ## search.send_keys(address)
                            ## button = driver.find_element_by_css_selector('.ui-bar > a:nth-child(2)')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b > span > span')
                            ## button.click()
                            ## time.sleep(4)

                            ## #results = driver.find_element_by_css_selector('.dojoxGridMasterView')
                            ## results = driver.find_element_by_css_selector('.dojoxGridContent > div:nth-child(1)')
                            ## print results.get_attribute('innerHTML')
                            ## print parcel_id
                            ## options = results.find_elements_by_tag_name('div')
                            ## #options = results.find_elements_by_link_text(parcel_id)
                            ## print options
                            ## #something didn't work with this:
                            ## #look_for = '<td tabindex="-1" role="gridcell" colspan="1" class="dojoxGridCell" idx="0" style="width:90px;">%s</td>' % parcel_id
                            ## look_for = '>%s<' % parcel_id
                            ## matches = []
                            ## for option in options:
                            ##     markup = option.get_attribute('innerHTML')
                            ##     #print markup
                            ##     if re.search(look_for, markup):
                            ##         matches.append(option)
                            ##         #print "MATCH!"
                            ## if len(matches) > 1:
                            ##     print matches
                            ##     raise ValueError, "Too many matches!"
                            ## else:
                            ##     matches[0].click()

                            #just realized that this form uses the property_id,
                            #which we already have...
                            #can skip the steps above that are trying to make this link:
                            base = "http://www.gocolumbiamo.com/cfforms/ub/ubdata.cfm?LOCID=%s&AppNum=79" % parcel_id
                            driver.get(base)

                            try:
                                heat_source = driver.find_element_by_css_selector('#PrimaryCenterColumn > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(1) > strong:nth-child(1) > font:nth-child(1)')
                                if heat_source.text.strip() == "Heating Source: Gas Heat":
                                    bldg.heat_source_details = 'gas'
                                    bldg.save()
                                else:
                                    print heat_source.text
                                    exit()
                                    #TODO: bldg.heat_source_details = 'electric'

                                bldg.who_pays_gas = 'not_available'
                            #a bare except here would also swallow the
                            #SystemExit raised by exit() above:
                            except Exception:
                                print "heat source not found... skipping"

                            try:
                                selector = driver.find_element_by_css_selector('#el_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except Exception:
                                #original message said "Water" here; this
                                #branch handles the electric table
                                print "No Electric data available... skipping"
                            else:
                                body = driver.find_element_by_css_selector('#el_table > tbody:nth-child(3)')
                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='electricity')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' KWH', '')
                                    update_summary(query, date, cost, amount,
                                                   bldg, unit, provider,
                                                   'electricity', 'kwh')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text

                            #print dir(bldg)
                            #print bldg.utilitysummary_set
                            #query = bldg.utilitysummary_set.filter(type=utility_type[0])

                            #could look up type from UTILITY_TYPES...
                            #but in this case we know what they should be
                            #query = bldg.utilitysummary_set.filter(type='water')
                            #if len(query):

                            try:
                                water = driver.find_element_by_css_selector('#ext-gen23')
                                water.click()
                                selector = driver.find_element_by_css_selector('#wr_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except Exception:
                                print "No Water data available... skipping"
                            else:
                                body = driver.find_element_by_css_selector('#wr_table > tbody:nth-child(3)')
                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='water')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' CCF', '')
                                    update_summary(query, date, cost, amount,
                                                   bldg, unit, provider,
                                                   'water', 'ccf')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text

                            unit.update_averages()
                            #see if we have enough info now to make a score:
                            unit.update_energy_score()

                            #now that we've saved the unit,
                            #update the averages for the whole building:
                            unit.building.update_utility_averages()
                            unit.building.update_rent_details()

                position += 1
                save_json(position_file, position)

                if any_updated:
                    #back it up for later
                    #enable this when downloading GPS coordinates...
                    #the rest of the time it slows things down
                    local_cache['buildings'] = {}
                    for key, value in search_results.items():
                        #search_results[key] = SearchResults().from_dict(value)
                        local_cache['buildings'][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                    position = count
                    save_json(position_file, position)
                    exit()

            end = datetime.now()
            print "finished: ", end
            total_time = end - start
            print total_time
            print
# This code computes the BLEU score for the candidate sentences
import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json
from helpers import load_json
#from NN_architecture import generate_caption

# run the NN architecture before
captions_test = load_json('captions_test_saifullah')

# earlier runs read the generated captions from a text file:
##generate_caption(path+filenames_test[0])
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions = inFile.readlines()
#for i in range(len(generated_test_captions)):
#    ## THIS LINE REMOVES THE FIRST EMPTY SPACE
#    ## generated_test_captions[i] = generated_test_captions[i][1:]
#    generated_test_captions[i] = generated_test_captions[i].replace('\n', '')

# load from json
with open('InceptionCaptions/9_greedy.json') as inFile:
    generated_test_captions = json.load(inFile)

#c_to_insert = generated_test_captions[883]
#generated_test_captions.insert(884, c_to_insert)
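# Per-sentence BLEU for a single candidate, as a hedged sketch; assumes
# captions_test[i] holds the reference captions for test image i:
smooth = SmoothingFunction().method1
i = 0
refs = [word_tokenize(ref) for ref in captions_test[i]]
candidate = word_tokenize(generated_test_captions[i])
print('Sentence BLEU for image %d: %.4f' %
      (i, sentence_bleu(refs, candidate, smoothing_function=smooth)))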
from collections import OrderedDict

from rwrs import app
import helpers

MAPS = helpers.load_json(app.config['MAPS_DATA_FILE'])
RANKS = helpers.load_json(app.config['RANKS_DATA_FILE'])

SQUADMATES_STEPS_XP = 1000  # One squad mate is gained every 1000 XP
MAX_SQUADMATES = 10  # Maximum number of squad mates allowed

UNLOCKABLES = {
    'vanilla': OrderedDict([
        (0, {
            'weapons': [
                {'image': 'assault_rifles', 'name': 'Assault rifles'},
                {'image': 'shotguns', 'name': 'Shotguns'}
            ],
            'equipment': [
                {'image': 'riot_shield', 'name': 'Riot shield'}
            ],
            'throwables': [
                {'image': 'hand_stun_grenades', 'name': '2 hand/stun grenades'}
            ]
        }),
        (500, {
            'weapons': [
                {'image': 'bazooka', 'name': 'Bazooka'},
                {'image': 'pistols_sd', 'name': 'Silenced pistols'}
            ],
            'equipment': [
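# Example of how the two squad-mate constants combine; a hypothetical helper,
# not part of the original module:
def squadmates_for_xp(xp):
    """Number of squad mates earned: one per 1000 XP, capped at 10."""
    return min(MAX_SQUADMATES, xp // SQUADMATES_STEPS_XP)

# squadmates_for_xp(3500) == 3; squadmates_for_xp(25000) == 10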
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)

    # Keras may reduce this loss across the first axis (the batch), but the
    # semantics are unclear, so to be sure we take the loss across the entire
    # 2-rank tensor and reduce it to a single scalar with the mean function.
    loss_mean = tf.reduce_mean(loss)

    return loss_mean


# recreate the tokenizer
mark_start = 'ssss '
mark_end = ' eeee'
captions_train = load_json('captions_train')
captions_train_marked = mark_captions(captions_train)
captions_train_flat = flatten(captions_train_marked)
tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=2000)
token_start = tokenizer.word_index[mark_start.strip()]
token_end = tokenizer.word_index[mark_end.strip()]
tokens_train = tokenizer.captions_to_tokens(captions_train_marked)

filenames_val = load_json('filenames_val')


def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).
    """
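# Typical usage of the custom loss above when compiling a decoder model in
# Keras; the sparse_cross_entropy name for the enclosing function, and the
# decoder_model / decoder_target / RMSprop names, are assumptions here:
# decoder_model.compile(optimizer=RMSprop(lr=1e-3),
#                       loss=sparse_cross_entropy,
#                       target_tensors=[decoder_target])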