def upload_box(mail, api, cmax=None):
    """Fetch messages from a mailbox and push each one through ``api``.

    ``mail`` must provide ``search`` and ``fetch``; ``api`` must provide
    ``message_insert``.  At most ``cmax`` messages are processed (all of
    them when ``cmax`` is None).  The id list returned by the search is
    reversed before processing.  Progress and the accumulated fetch/upload
    times are printed; nothing is returned.
    """
    status, payload = mail.search(None, "ALL")
    if status != 'OK':
        print('Could not access the mail box: {}'.format(status))
        return

    # The search reply carries the ids as one space separated string.
    message_ids = payload[0].split()
    message_ids.reverse()
    cmax = len(message_ids) if cmax is None else min(cmax, len(message_ids))

    tfetch = 0
    tupload = 0
    print('0.0 %', end='')
    for done, message_id in enumerate(message_ids[:cmax], start=1):
        # Fetch the raw message (RFC822) for this id.
        (_, fetched), elapsed = timed(mail.fetch, message_id, "(RFC822)")
        tfetch += elapsed

        # Encode: the raw mail is in bytes.
        raw = fetched[0][1]
        body, cte = _decode(raw)

        # Upload into the API; a failed upload is reported and skipped.
        try:
            _, elapsed = timed(api.message_insert, 'media', body, cte)
            tupload += elapsed
        except ApiError as err:
            print('\nFailed to upload one message: {}\n'.format(err))

        # Progress, rewritten in place on the current terminal line.
        percent = done * 100 / cmax
        print('\r\033[0K{0:.2f} %'.format(percent), end='')

    print('\r\033[0KDone.\n-- fetch time: {}\n-- upload time {}'.format(tfetch, tupload))
def run_guarded(self, context):
    """Run every sub-process, then dump/diff/purge as configured.

    When ``config.log_level`` is "processes" each sub-process is announced
    and run through ``utils.timed``; otherwise it is run directly.  The
    autodump / autodiff / purge steps are in a ``finally`` clause, so they
    execute even if a sub-process raises.
    """
    period = context.period
    if config.log_level == "processes":
        print()
    try:
        for label, step in self.subprocesses:
            if config.log_level != "processes":
                step.run_guarded(context)
            else:
                if label is None:
                    print(" *", end=' ')
                else:
                    print(" *", label, end=' ')
                utils.timed(step.run_guarded, context)
            context.simulation.start_console(context)
    finally:
        if config.autodump is not None:
            self._autodump(context)
        if config.autodiff is not None:
            self._autodiff(period)
        if self.purge:
            self.entity.purge_locals()
def test_graph_components():
    """Check ``largest_cluster`` directly and through a ``tf.function``.

    The traced wrapper is called with several ``max_iters`` values; with
    only 2 iterations the first graph is expected to report 3 instead of 5.
    """
    @tf.function
    def comps(spins, iters):
        return gis.largest_cluster(spins, max_iters=iters)

    edge_lists = [
        # comp sizes 2, 5
        [(0, 1), (0, 2), (1, 2), (2, 3), (3, 6), (5, 4)],
        # comp sizes 2, 2, 3
        [(0, 1), (0, 2), (1, 2), (3, 6), (5, 4)],
        # comp sizes 6, less vertices
        [(0, 1), (0, 2), (1, 2), (2, 5), (3, 4), (5, 4)],
    ]
    tfgs = [TFGraph(nx.Graph(edges)) for edges in edge_lists]
    gis = GraphIsing(tfgs, 10, 10)
    s0 = gis.initial_spins(1.0)

    with timed('direct'):
        assert (gis.largest_cluster(s0).numpy() == [5, 3, 6]).all()

    traced_cases = (
        ('comps(3) #1', 3, [5, 3, 6]),
        ('comps(16) #1', 16, [5, 3, 6]),
        ('comps(2) #1', 2, [3, 3, 6]),
        ('comps(2) #2', 2, [3, 3, 6]),
    )
    for label, iters, expected in traced_cases:
        with timed(label):
            assert (comps(s0, tf.constant(iters)).numpy() == expected).all()

    gis.log_metrics()
def sgd(self, reg=.9, lr_init=1., step=2, n_iters=300000, val_interval=5000):
    """
    Stochastic gradient descent over single training examples.

    when using Gibbs approximation and the current y is passed to the
    gradient function, this will perform CD-k (i.e., starting the gibbs
    sampler from the current y, and sampling one example after a burn-in
    time of k steps, typically 1)

    Every ``val_interval`` iterations the validation loss and a copy of the
    weights are recorded.  Ctrl-C stops training early.  Returns
    ``(W_opt, val_loss, Ws_val)``.
    """
    print('[START] SML/SGD Training\n\nTR/VAL/TE SIZES: %s\n' % self.crf.Ns)
    if reg > 0:
        grad = self.regularize(reg)[1]
    else:
        grad = self.grad
    self.val_loss = []
    self.Ws_val = []
    lr = lr_init
    with timed('SML/SGD', self):
        try:
            for i in xrange(1, n_iters + 1):
                # one randomly drawn training example per update
                r = randint(0, self.crf.N_tr - 1)
                g = grad(self.W_opt, self.crf.X[r], self.crf.Y[r])
                self.W_opt -= lr * g
                print('Iteration #%s: lr=%s, |grad|=%s' % (i, lr, np.linalg.norm(g)))
                if step:
                    # stepwise decay: the rate is scaled by 0.1 per completed stage
                    lr = lr_init * np.power(.1, np.floor(i * (step+1) / n_iters))
                if i % val_interval != 0:
                    continue
                print('\nCurrent norm: |W| = %s' % np.linalg.norm(self.W_opt))
                Ws = self.crf.split_W(self.W_opt)
                with timed('Validation Iter (MAP predict)', skip=''):
                    predictions = [self.crf.MAP(x, Ws) for x in self.crf.X_v]
                    loss = self.ev(self.crf.Y_v, predictions)
                    print('\tVAL LOSS: %s\n' % self.ev.get_names(loss))
                self.val_loss.append(loss)
                self.Ws_val.append(np.array(self.W_opt))
        except KeyboardInterrupt:
            print('\nINFO - Manually exited train loop at Iteration %s' % i)
    return self.W_opt, self.val_loss, self.Ws_val
def simulate_period(period_idx, period, processes, entities, init=False):
    """Simulate one period: load data, run the processes, store data.

    ``processes`` is a sequence of ``(process, periodicity)`` pairs; a
    process only runs when ``period_idx`` is a multiple of its periodicity.
    ``self``, ``globals_data``, ``process_time`` and ``period_objects`` are
    not parameters: they are taken from the scope this function is defined
    in.
    """
    print("\nperiod", period)
    if not init:
        print("- loading input data")
        for entity in entities:
            print(" *", entity.name, "...", end=' ')
            timed(entity.load_period_data, period)
            print(" -> %d individuals" % len(entity.array))
    else:
        for entity in entities:
            print(" * %s: %d individuals" % (entity.name, len(entity.array)))

    for entity in entities:
        entity.array_period = period
        entity.array['period'] = period

    if processes:
        # build context for this period:
        const_dict = {
            '__simulation__': self,
            'period': period,
            'nan': float('nan'),
            '__globals__': globals_data,
        }
        total = len(processes)
        for position, (process, periodicity) in enumerate(processes, start=1):
            print("- %d/%d" % (position, total), process.name, "...", end=' ')
            if period_idx % periodicity != 0:
                elapsed = 0
                print("skipped (periodicity)")
            else:
                elapsed, _ = gettime(process.run_guarded, self, const_dict)

            process_time[process.name] += elapsed
            if config.show_timings:
                print("done (%s elapsed)." % time2str(elapsed))
            else:
                print("done.")
            self.start_console(process.entity, period, globals_data)

    print("- storing period data")
    for entity in entities:
        print(" *", entity.name, "...", end=' ')
        timed(entity.store_period_data, period)
        print(" -> %d individuals" % len(entity.array))

    period_objects[period] = sum(len(entity.array) for entity in entities)
def preprocess(experiment, dataset):
    """Build the ``(input, label)`` lists for the train and test splits.

    Training inputs are padded (by 4), normalised and transposed; test
    inputs are only normalised and transposed.  ``experiment`` is accepted
    but not used.  Returns ``(train_set, test_set)``.
    """
    with timed("Preprocessing training data"):
        train = dataset['train']
        train_inputs = transpose(normalise(pad(train['data'], 4)))
        train_set = list(zip(train_inputs, train['labels']))
    with timed("Preprocessing test data"):
        test = dataset['test']
        test_inputs = transpose(normalise(test['data']))
        test_set = list(zip(test_inputs, test['labels']))
    return train_set, test_set
def simulate_period(period_idx, period, processes, entities, init=False):
    """Simulate one period: load data, run the processes, store data.

    ``processes`` is a sequence of ``(process, periodicity)`` pairs; a
    process only runs when ``period_idx`` is a multiple of its periodicity.
    ``self``, ``globals_data``, ``process_time`` and ``period_objects`` are
    not parameters: they come from the scope this function is defined in.

    Python 2 code: the trailing commas on ``print`` keep the cursor on the
    same output line.
    """
    print "\nperiod", period
    if init:
        # initial period: only report the population sizes
        for entity in entities:
            print " * %s: %d individuals" % (entity.name, len(entity.array))
    else:
        print "- loading input data"
        for entity in entities:
            print " *", entity.name, "...",
            timed(entity.load_period_data, period)
            print " -> %d individuals" % len(entity.array)
    # stamp the current period on every entity
    for entity in entities:
        entity.array_period = period
        entity.array["period"] = period
    if processes:
        # build context for this period:
        const_dict = {"period": period, "nan": float("nan"), "__globals__": globals_data}
        num_processes = len(processes)
        for p_num, process_def in enumerate(processes, start=1):
            process, periodicity = process_def
            print "- %d/%d" % (p_num, num_processes), process.name,
            # TODO: provide a custom __str__ method for Process &
            # Assignment instead
            if hasattr(process, "predictor") and process.predictor and process.predictor != process.name:
                print "(%s)" % process.predictor,
            print "...",
            if period_idx % periodicity == 0:
                elapsed, _ = gettime(process.run_guarded, self, const_dict)
            else:
                elapsed = 0
                print "skipped (periodicity)"
            # accumulate the per-process time (0 when skipped)
            process_time[process.name] += elapsed
            if config.show_timings:
                print "done (%s elapsed)." % time2str(elapsed)
            else:
                print "done."
            self.start_console(process.entity, period, globals_data)
    print "- storing period data"
    for entity in entities:
        print " *", entity.name, "...",
        timed(entity.store_period_data, period)
        print " -> %d individuals" % len(entity.array)
    # print " - compressing period data"
    # for entity in entities:
    #     print " *", entity.name, "...",
    #     for level in range(1, 10, 2):
    #         print " %d:" % level,
    #         timed(entity.compress_period_data, level)
    # total number of individuals over all entities for this period
    period_objects[period] = sum(len(entity.array) for entity in entities)
def run_guarded(self, simulation, const_dict):
    """Run every sub-process through ``utils.timed``, then drop locals.

    After the loop, every key of ``self.entity.temp_variables`` that is not
    also a key of ``self.entity.variables`` is deleted.

    Python 2 code: the trailing commas on ``print`` keep the cursor on the
    same output line.
    """
    print
    for k, v in self.subprocesses:
        print " *",
        if k is not None:
            print k,
        utils.timed(v.run_guarded, simulation, const_dict)
        # print "done."
        simulation.start_console(v.entity, const_dict["period"], const_dict["__globals__"])
    # purge all local variables
    temp_vars = self.entity.temp_variables
    all_vars = self.entity.variables
    # "local" = present among the temporaries but not a declared variable
    local_vars = set(temp_vars.keys()) - set(all_vars.keys())
    for var in local_vars:
        del temp_vars[var]
def aggregate(self, report_id):
    """Aggregate a customer's service usage per tariff for a report.

    ``report_id`` must expose ``customer_id``, ``start`` and ``end``.
    Usage rows are 4-tuples ``(service_id, tariff_id, cost, usage_volume)``;
    rows without a tariff id are logged and skipped.  One tariff report is
    created per distinct tariff id and the per-currency totals are converted
    with ``decimal_to_string`` before being handed to ``prepare_result``.

    Raises ``Exception`` when the customer cannot be found.
    """
    logbook.info("Get customer usage aggregation for {}", report_id)
    customer = Customer.get_by_id(report_id.customer_id)
    if not customer:
        raise Exception("Customer %s not found" % report_id.customer_id)

    with timed("get_usage simple"):
        aggregated_usage = ServiceUsage.get_usage(customer, report_id.start, report_id.end)

    tariffs = {}
    services = set()
    for usage in aggregated_usage:
        service_id, tariff_id, _cost, _usage_volume = usage
        services.add(service_id)
        if not tariff_id:
            logbook.error("ServiceUsage {} is not completed. Tariff is not filled", usage)
            continue
        tariff_report = tariffs.get(tariff_id)
        if tariff_report is None:
            # Look the tariff up only when its report has to be created,
            # i.e. once per tariff instead of once per usage row.
            tariff = Tariff.get_by_id(tariff_id)
            tariff_report = self.tariff_report_type(tariff, customer)
            tariffs[tariff_id] = tariff_report
        tariff_report.add_usage(usage)

    total = Counter()
    for tariff_report in tariffs.values():
        total_tariff, currency = tariff_report.aggregate()
        total[currency] += total_tariff
    # Iterate over a snapshot of the keys while the values are replaced.
    for currency in list(total):
        total[currency] = decimal_to_string(total[currency])

    logbook.info("Aggregated {} for {}. Services: {}", total, customer, services)
    return self.prepare_result(list(tariffs.values()), total, customer, report_id.start, report_id.end)
def aggregate(self, report_id):
    """Aggregate a customer's detailed service usage per tariff.

    ``report_id`` must expose ``customer_id``, ``start`` and ``end``.
    Each usage record must expose ``tariff_id``.  One tariff report is
    created per distinct tariff id and the per-currency totals are converted
    with ``decimal_to_string`` before being handed to ``prepare_result``.

    Raises ``Exception`` when the customer cannot be found.
    """
    logbook.info("Get detailed customer usage aggregation for {}", report_id)
    customer = Customer.get_by_id(report_id.customer_id)
    if not customer:
        raise Exception("Customer %s not found" % report_id.customer_id)

    with timed("get_usage simple"):
        aggregated_usage = ServiceUsage.get_detailed_usage(
            customer, report_id.start, report_id.end)

    tariffs = {}
    # NOTE(review): never filled in this variant, so the log line below
    # always reports an empty set -- confirm whether that is intended.
    services = set()
    for usage in aggregated_usage:
        tariff_id = usage.tariff_id
        tariff_report = tariffs.get(tariff_id)
        if tariff_report is None:
            # Look the tariff up only when its report has to be created,
            # i.e. once per tariff instead of once per usage record.
            tariff = Tariff.get_by_id(tariff_id)
            tariff_report = self.tariff_report_type(tariff, customer)
            tariffs[tariff_id] = tariff_report
        tariff_report.add_usage(usage)

    total = Counter()
    for tariff_report in tariffs.values():
        total_tariff, currency = tariff_report.aggregate()
        total[currency] += total_tariff
    # Iterate over a snapshot of the keys while the values are replaced.
    for currency in list(total):
        total[currency] = decimal_to_string(total[currency])

    logbook.info("Aggregated {} for {}. Services: {}", total, customer, services)
    return self.prepare_result(list(tariffs.values()), total, customer, report_id.start, report_id.end)
def _build_model(self, device=None, batch_size=None):
    """Instantiate the configured network and optionally move it to a device.

    The architecture is looked up by name (``self.config.net``) in the
    ``models`` module and combined with ``self.config.losses``.  The model
    is only transferred when ``device`` is truthy.  ``batch_size`` is
    accepted but not used here.
    """
    with timed("\nBuilding model..."):
        architecture = vars(models)[self.config.net]
        layers = union(architecture(), self.config.losses)
        model = Network(layers)
        if device:
            print(f"Transferring model to device gpu:{device}")
            model = model.to(device)
    return model
def val(W):
    """Report the objective at ``W`` and record the validation loss.

    Appends the loss to ``self.val_loss`` and a copy of ``W`` to
    ``self.Ws_val``.  ``obj`` and ``self`` come from the enclosing scope.
    """
    print('current W - obj: %s, norm: %s' % (obj(W), np.linalg.norm(W)))
    Ws = self.crf.split_W(W)
    with timed('Validation (MAP predict)', skip=''):
        predictions = [self.crf.MAP(x, Ws) for x in self.crf.X_v]
        loss = self.ev(self.crf.Y_v, predictions)
        print('\tVAL LOSS: %s\n' % self.ev.get_names(loss))
    self.val_loss.append(loss)
    self.Ws_val.append(np.array(W))
def _preprocess(self, dataset):
    """Apply the configured preprocessors and pair inputs with labels.

    Returns ``(train_set, test_set)``, each a list of ``(input, label)``
    tuples.  A split whose preprocessor list is empty is left untouched.
    """
    def build(split, preprocessors_attr):
        # Pair the (optionally preprocessed) data of one split with its labels.
        data, labels = dataset[split]['data'], dataset[split]['labels']
        preprocessors = getattr(self.config, preprocessors_attr)
        if preprocessors:
            data = toolz.pipe(data, *preprocessors)
        return list(zip(data, labels))

    with timed("\nPreprocessing training data"):
        train_set = build('train', 'train_preprocessors')
    with timed("Preprocessing test data"):
        test_set = build('test', 'test_preprocessors')
    return train_set, test_set
def test_bench():
    """Time updates and cluster computations on many power-law graphs (CPU).

    Each traced function is called once as a warm-up and then twice more,
    every call inside its own ``timed`` block.
    """
    with tf.device("/device:CPU:0"):
        N = 1000
        K = 1000
        graph = nx.random_graphs.powerlaw_cluster_graph(N, 5, 0.5)
        tfg = TFGraph(graph)
        gis = GraphIsing([tfg] * K)
        s0 = gis.initial_spins(-1.0)
        s1 = gis.initial_spins(1.0)
        print("Graphs: {} powerlaw graphs, {} nodes ({} tot nodes)".format(
            K, N, N * K))

        @tf.function
        def repeat_update(spins, iters):
            for _ in range(iters):
                spins = gis.update(spins, 0.5)
            return spins

        @tf.function
        def repeat_components(spins):
            return gis.largest_cluster(spins)

        @tf.function
        def repeat_sampled_components(spins, samples):
            return gis.sampled_largest_cluster(spins, samples=samples)

        # Thunks keep argument construction inside the timed region.
        benchmarks = (
            ('warmup',
             lambda: repeat_update(s0, tf.constant(1))),
            ('run 100x updates #1',
             lambda: repeat_update(s0, tf.constant(100))),
            ('run 100x updates #2',
             lambda: repeat_update(s0, tf.constant(100))),
            ('warmup',
             lambda: repeat_components(s1)),
            ('run 1x components #1',
             lambda: repeat_components(s1)),
            ('run 1x components #2',
             lambda: repeat_components(s1)),
            ('warmup',
             lambda: repeat_sampled_components(s1, tf.constant(1))),
            ('run 1x sampled components (10 samples) #1',
             lambda: repeat_sampled_components(s1, tf.constant(10))),
            ('run 1x sampled components (10 samples) #2',
             lambda: repeat_sampled_components(s1, tf.constant(10))),
        )
        for label, run in benchmarks:
            with timed(label):
                run()

        gis.log_metrics()
def run_guarded(self, simulation, const_dict):
    """Run every sub-process through ``utils.timed``, then dump/diff/purge.

    The period is read from ``const_dict['period']``.  After each
    sub-process ``simulation.start_console`` is called; autodump, autodiff
    and the purge of locals run once all sub-processes have finished.
    """
    period = const_dict['period']
    print()
    for label, step in self.subprocesses:
        if label is None:
            print(" *", end=' ')
        else:
            print(" *", label, end=' ')
        utils.timed(step.run_guarded, simulation, const_dict)
        simulation.start_console(step.entity, period,
                                 const_dict['__globals__'])

    if config.autodump is not None:
        self._autodump(period)
    if config.autodiff is not None:
        self._autodiff(period)
    if self.purge:
        self.entity.purge_locals()
def run_guarded(self, simulation, const_dict):
    """Run every sub-process, dump/diff if configured, then purge locals.

    The period is ``const_dict['periods'][const_dict['period_idx']]``.
    Locals are the keys of ``self.entity.temp_variables`` that are not keys
    of ``self.entity.variables``; with ``config.debug`` a summary of what is
    about to be freed is printed and the module-level ``max_vars`` high-water
    mark is updated.
    """
    global max_vars

    period = const_dict['periods'][const_dict['period_idx']]
    print()
    for label, step in self.subprocesses:
        if label is not None:
            print(label, end=' ')
        utils.timed(step.run_guarded, simulation, const_dict)
        simulation.start_console(step.entity, period,
                                 const_dict['__globals__'])

    if config.autodump is not None:
        self._autodump(period)
    if config.autodiff is not None:
        self._autodiff(period)

    # purge all local variables
    temp_vars = self.entity.temp_variables
    declared = self.entity.variables
    local_var_names = set(temp_vars.keys()) - set(declared.keys())
    num_locals = len(local_var_names)
    if config.debug and num_locals:
        # only array-valued locals contribute to the memory figures
        arrays = [value for name, value in temp_vars.iteritems()
                  if name in local_var_names and isinstance(value, np.ndarray)]
        max_vars = max(max_vars, num_locals)
        temp_mem = sum(arr.nbytes for arr in arrays)
        avgsize = sum(arr.dtype.itemsize for arr in arrays) / num_locals
        message = ("purging {} variables (max {}), will free {} of memory "
                   "(avg field size: {} b)".format(num_locals, max_vars,
                                                   utils.size2str(temp_mem),
                                                   avgsize))
        print(message)
    for name in local_var_names:
        del temp_vars[name]
def test_update_and_caching():
    """Check that repeated updates reuse a single traced function.

    The traced ``repeat`` is called with different iteration counts, and
    again after ``set_graphs``; in both checks exactly one concrete function
    must exist.
    """
    n_nodes = 1000
    n_graphs = 100
    graph = nx.random_graphs.powerlaw_cluster_graph(n_nodes, 3, 0.5)
    with timed('TFGraph and GraphIsing'):
        tfg = TFGraph(graph)
        gis = GraphIsing([tfg] * n_graphs, n_nodes, n_nodes * 4)
    s0 = gis.initial_spins(-1.0)

    @tf.function
    def repeat(iters, data):
        for _ in range(iters):
            data = gis.update(data, 0.5)
        return data

    def concrete_function_count():
        return len(repeat._list_all_concrete_functions_for_serialization())

    runs = (
        ('single #1', 1),
        ('single #2', 1),
        ('repeated(10) #1', 10),
        ('repeated(10) #2', 10),
    )
    for label, iters in runs:
        with timed(label):
            repeat(tf.constant(iters), s0)
    assert concrete_function_count() == 1

    gis.set_graphs([tfg] * n_graphs)
    with timed('single #3'):
        repeat(tf.constant(1), -s0)
    assert concrete_function_count() == 1

    gis.log_metrics()
def test_create():
    """Exercise the TFGraph / GraphIsing constructors and ``set_graphs``.

    Covers exact sizes, graphs passed at construction time, a smaller graph
    loaded into a larger ``GraphIsing`` (checking the node masks) and
    automatically sized objects.
    """
    N = 10
    K = 10
    g = nx.random_graphs.powerlaw_cluster_graph(N, 3, 0.5)

    # test exact sizes
    with timed('TFGraph'):
        tfg = TFGraph(g, N, N * 4)
    with timed('GraphIsing'):
        gis = GraphIsing(K, N, N * 4)
    with timed('set_graphs'):
        gis.set_graphs([tfg] * K)
    with timed('GraphIsing (with graphs)'):
        gis2 = GraphIsing([tfg] * K, N, N * 4)

    # smaller graph can be set to gis
    with timed('set_graphs with smaller TFGraph'):
        # Integer division: the node count must be an int (N / 2 is a
        # float under Python 3).
        g2 = nx.random_graphs.powerlaw_cluster_graph(N // 2, 3, 0.5)
        tfg2 = TFGraph(g2)
        gis.set_graphs([tfg2] * (2 * K // 3))
    # masks are set for the nodes of the smaller graph and cleared beyond
    assert (gis.v_node_masks.numpy()[0, :g2.order()] == True).all()
    assert (gis.v_node_masks.numpy()[0, g2.order():] == False).all()

    # gis with auto sizes
    with timed('auto sized TFGraph and GraphIsing'):
        g3 = nx.random_graphs.powerlaw_cluster_graph(N, 3, 0.5)
        tfg3 = TFGraph(g3)
        gis3 = GraphIsing([tfg3] * K)
    gis.set_graphs([tfg2] * K)
def report_file_generate(self, report_id):
    """Render a report, cache the result and remove its pending task.

    The aggregated data comes from the report cache when available;
    otherwise it is computed with ``get_aggregation`` and passed through a
    pack/unpack round trip before rendering.
    """
    from report import Report
    from memdb.report_cache import ReportCache, ReportTask

    cache = ReportCache()
    aggregated = cache.get_report_aggregated(report_id)
    if not aggregated:
        fresh = get_aggregation(report_id)
        packed = ReportCache.pack_aggregated(fresh)
        aggregated = ReportCache.unpack_aggregated(packed)

    generator = Report.get_report(report_id.report_type)
    with timed("rendering for %s" % report_id):
        data = generator.render(aggregated, report_id)

    cache.set_report(report_id, data, generator.report_cache_time)
    ReportTask().remove(report_id)
def user_tiles(adapter, count=5000):
    """Replay up to ``count`` tile queries from ./tiles.txt and print the
    average query time in seconds.

    Fields 1-4 of each '/'-separated line are read as
    ``zoom, resolution, x, y``.  ``timed`` is expected to wrap the query so
    that it returns ``(elapsed, result)`` with ``elapsed`` a
    ``datetime.timedelta`` -- TODO confirm against the ``timed`` helper.
    """
    query = timed(adapter.query_tile_time_percent)
    total_seconds = 0.0
    executed = 0
    with open('./tiles.txt') as f:
        queries = (tuple(int(x) for x in line.split('/')[1:5]) for line in f)
        for zoom, resolution, x, y in itertools.islice(queries, count):
            time_v, _ = query(x, y, zoom, resolution, 0.0, 1.0)
            # total_seconds() covers the whole duration; .microseconds is
            # only the sub-second component of the timedelta.
            total_seconds += time_v.total_seconds()
            executed += 1
            time.sleep(DELAY)
    # Average over the queries actually run (the file may hold fewer than
    # ``count`` lines); report 0.0 when nothing was run.
    average = total_seconds / executed if executed else 0.0
    print('User queries: average time {}'.format(average))
def get_tenant_usage(self, tenant_id, meter_name, start, end, limit=None):
    """Query ceilometer for the samples in ``[start, end)``.

    The tenant and meter filters are only added when the corresponding
    argument is truthy.  Returns the sample list from the ceilometer client.
    """
    query = [
        self.filter('timestamp', 'ge', start),
        self.filter('timestamp', 'lt', end),
    ]
    optional_filters = (('project_id', tenant_id), ('meter', meter_name))
    for field, value in optional_filters:
        if value:
            query.append(self.filter(field, 'eq', value))

    with timed('fetch global usage for meter %s' % meter_name):
        result = openstack.client_ceilometer.new_samples.list(q=query, limit=limit)
    log.debug("Get usage for tenant: {} and meter_name {} ({} - {}). Number records: {}",
              tenant_id, meter_name, start, end, len(result))
    return result
def report_file_generate(self, report_id):
    """Render a report, store it in the cache and drop its pending task.

    Cached aggregated data is used when present; otherwise the aggregation
    is computed and normalised through ``pack_aggregated`` /
    ``unpack_aggregated`` before rendering.
    """
    from report import Report
    from memdb.report_cache import ReportCache, ReportTask

    report_cache = ReportCache()
    aggregated = report_cache.get_report_aggregated(report_id)
    if not aggregated:
        computed = get_aggregation(report_id)
        aggregated = ReportCache.unpack_aggregated(
            ReportCache.pack_aggregated(computed))

    renderer = Report.get_report(report_id.report_type)
    label = "rendering for %s" % report_id
    with timed(label):
        rendered = renderer.render(aggregated, report_id)

    report_cache.set_report(report_id, rendered, renderer.report_cache_time)
    ReportTask().remove(report_id)
def filter_and_group(usage):
    """Group usage samples by ``resource_id``, dropping untrusted sources.

    When ``conf.fitter.trust_sources`` is non-empty, samples whose
    ``source`` is not listed are logged and skipped.  Returns a
    ``defaultdict(list)`` mapping resource id to its samples.
    """
    usage_by_resource = defaultdict(list)
    with timed("filter and group by resource"):
        trust_sources = set(conf.fitter.trust_sources)
        for u in usage:
            # the user can make their own samples, including those
            # that would collide with what we care about for
            # billing.
            # if we have a list of trust sources configured, then
            # discard everything not matching.
            if trust_sources and u.source not in trust_sources:
                # Use attribute access here too, consistent with the check
                # above and with u.resource_id below.
                logbook.warning('ignoring untrusted usage sample from source `{}`', u.source)
                continue
            usage_by_resource[u.resource_id].append(u)
    return usage_by_resource
def filter_and_group(usage):
    """Group usage samples by ``resource_id``, dropping untrusted sources.

    When ``conf.fitter.trust_sources`` is non-empty, samples whose
    ``source`` is not listed are logged and skipped.  Returns a
    ``defaultdict(list)`` mapping resource id to its samples.
    """
    usage_by_resource = defaultdict(list)
    with timed("filter and group by resource"):
        trust_sources = set(conf.fitter.trust_sources)
        for u in usage:
            # the user can make their own samples, including those
            # that would collide with what we care about for
            # billing.
            # if we have a list of trust sources configured, then
            # discard everything not matching.
            if trust_sources and u.source not in trust_sources:
                # Use attribute access here too, consistent with the check
                # above and with u.resource_id below.
                logbook.warning(
                    'ignoring untrusted usage sample from source `{}`',
                    u.source)
                continue
            usage_by_resource[u.resource_id].append(u)
    return usage_by_resource
def region_time(adapter, latlon_mag=160, latlon_step=160, time_steps=5, zoom=4):
    """Time region queries over a grid of boxes and time windows and print
    the average query time in seconds.

    Only combinations with ``lat0 < lat1``, ``lon0 < lon1`` and
    ``start < end`` are queried.  ``timed`` is expected to wrap the query so
    that it returns ``(elapsed, result)`` with ``elapsed`` a
    ``datetime.timedelta`` -- TODO confirm against the ``timed`` helper.
    """
    bounds = [float(x) / 180.
              for x in range(-latlon_mag, latlon_mag + latlon_step, latlon_step)]
    fractions = [float(x) / time_steps for x in range(time_steps)]
    query = timed(adapter.query_region_latlon_time_percent)
    count = 0
    total_seconds = 0.
    combos = itertools.product(bounds, bounds, bounds, bounds, fractions, fractions)
    for lat0, lat1, lon0, lon1, start, end in combos:
        if lat0 >= lat1 or lon0 >= lon1 or start >= end:
            continue
        time_v, _ = query(lat0, lon0, lat1, lon1, zoom, start, end)
        count += 1
        # total_seconds() covers the whole duration; .microseconds is only
        # the sub-second component of the timedelta.
        total_seconds += time_v.total_seconds()
        time.sleep(DELAY)
    # Avoid a ZeroDivisionError when no combination was valid.
    average = total_seconds / count if count else 0.0
    print('For regions, average time {}'.format(average))
def get_tenant_usage(self, tenant_id, meter_name, start, end, limit=None):
    """Fetch the ceilometer samples recorded in ``[start, end)``.

    Filters on tenant and meter are appended only for truthy arguments.
    Returns whatever the ceilometer client's sample listing returns.
    """
    time_range = [
        self.filter('timestamp', 'ge', start),
        self.filter('timestamp', 'lt', end),
    ]
    scope = []
    if tenant_id:
        scope.append(self.filter('project_id', 'eq', tenant_id))
    if meter_name:
        scope.append(self.filter('meter', 'eq', meter_name))
    query = time_range + scope

    with timed('fetch global usage for meter %s' % meter_name):
        result = openstack.client_ceilometer.new_samples.list(q=query,
                                                              limit=limit)
    log.debug(
        "Get usage for tenant: {} and meter_name {} ({} - {}). Number records: {}",
        tenant_id, meter_name, start, end, len(result))
    return result
def train(self, reg=.9, method='L-BFGS-B', disp=True, maxiter=100):
    """
    Optimise ``self.W_opt`` with ``scipy``'s ``minimize``.

    if implementing self.{obj(W),grad(W)} to be used with scipy optimization

    The validation loss and a copy of the weights are recorded by the
    optimiser callback.  Ctrl-C stops the optimisation.  Returns
    ``(W_opt, val_loss, Ws_val)``.
    """
    print('[START] SML/SGD Training\n\nTR/VAL/TE SIZES: %s\n' % self.crf.Ns)
    if reg > 0:
        obj, grad = self.regularize(reg)
    else:
        obj, grad = self.obj, self.grad
    self.val_loss = []
    self.Ws_val = []

    def val(W):
        # optimiser callback: report the objective and log the validation loss
        print('current W - obj: %s, norm: %s' % (obj(W), np.linalg.norm(W)))
        Ws = self.crf.split_W(W)
        with timed('Validation (MAP predict)', skip=''):
            predictions = [self.crf.MAP(x, Ws) for x in self.crf.X_v]
            loss = self.ev(self.crf.Y_v, predictions)
            print('\tVAL LOSS: %s\n' % self.ev.get_names(loss))
        self.val_loss.append(loss)
        self.Ws_val.append(np.array(W))

    with timed('Scipy Opt: %s' % method, self):
        try:
            options = {'maxiter': maxiter, 'disp': disp}
            self.opt = minimize(obj, self.W_opt, method=method, jac=grad,
                                callback=val, options=options)
            self.W_opt = self.opt.x
        except KeyboardInterrupt:
            print('\nINFO - Manually exited Scipy training')
    return self.W_opt, self.val_loss, self.Ws_val
def tile_time_resolution(adapter, time_steps=5, zoom=4, tile_samples=20, resolution=8):
    """Time tile queries for a fixed random sample of tiles and print the
    average query time in seconds.

    The random generator is seeded with 1 so the same tiles are sampled on
    every run.  Only time windows with ``start < end`` are queried.
    ``timed`` is expected to wrap the query so that it returns
    ``(elapsed, result)`` with ``elapsed`` a ``datetime.timedelta`` -- TODO
    confirm against the ``timed`` helper.
    """
    random.seed(1)
    side = range(0, 2**zoom)
    all_coords = list(itertools.product(side, side))
    coords = random.sample(all_coords, tile_samples)
    fractions = [float(x) / time_steps for x in range(time_steps)]
    query = timed(adapter.query_tile_time_percent)
    count = 0
    total_seconds = 0.0
    for start, end, (x, y) in itertools.product(fractions, fractions, coords):
        if start >= end:
            continue
        time_v, _ = query(x, y, zoom, resolution, start, end)
        count += 1
        # total_seconds() covers the whole duration; .microseconds is only
        # the sub-second component of the timedelta.
        total_seconds += time_v.total_seconds()
        time.sleep(DELAY)
    # Avoid a ZeroDivisionError when no window was valid.
    average = total_seconds / count if count else 0.0
    print('At resolution {} average time {}'.format(resolution, average))
def validate_on_end():
    """Train LightGBM on all but the last ``VALID_DAYS`` days and report
    the error on both the training part and the held-out tail.
    """
    X, y = load_data()
    days, ids, feats = X.shape
    assert y.shape == (days, ids)
    assert ids == ALL_IDS
    assert days == DATA_DAYS - SKIP_DAYS

    holdout = slice(-VALID_DAYS, None)
    history = slice(None, -VALID_DAYS)
    valid_X = X[holdout].reshape(-1, feats)
    valid_y = y[holdout].flatten()
    train_X = X[history].reshape(-1, feats)
    train_y = y[history].flatten()

    # drop the references to the full arrays before training
    del X
    del y
    gc.collect()

    with timed(f'training lightgbm with X.shape={train_X.shape}'):
        model = lgb.LGBMRegressor(n_estimators=100)
        model.fit(train_X, train_y)

    for title, features, target in (('train error:', train_X, train_y),
                                    ('valid error:', valid_X, valid_y)):
        print(title)
        valid_stats(model.predict(features), target, should_print=True)
import utils
from contextlib import closing
from django.db import connection
from django.utils import timezone


def sql_simple_insert(n_records):
    """Insert ``n_records`` rows into app_testmodel, one INSERT per row."""
    statement = (
        'INSERT INTO app_testmodel (field_1, field_2, field_3)'
        'VALUES (%s, %s, %s)'
    )
    with closing(connection.cursor()) as cursor:
        for i in xrange(n_records):
            row = (i, str(i), timezone.now())
            cursor.execute(statement, row)


if __name__ == '__main__':
    utils.timed(sql_simple_insert)
import utils
from contextlib import closing
from django.db import connection
from django.utils import timezone


def sql_batch_insert(n_records):
    """Insert ``n_records`` rows into app_testmodel with one multi-row INSERT."""
    placeholders = ', '.join(['(%s, %s, %s)'] * n_records)
    sql = 'INSERT INTO app_testmodel (field_1, field_2, field_3) VALUES {}'.format(
        placeholders,
    )

    # flat parameter list: three values per row, in row order
    params = []
    for i in xrange(n_records):
        params += [i, str(i), timezone.now()]

    with closing(connection.cursor()) as cursor:
        cursor.execute(sql, params)


if __name__ == '__main__':
    utils.timed(sql_batch_insert)
import utils
from contextlib import closing
import csv
from cStringIO import StringIO
from django.db import connection
from django.utils import timezone


def copy_from(n_records):
    """Load ``n_records`` rows into app_testmodel through the cursor's
    ``copy_from``, using an in-memory tab-separated buffer.
    """
    buf = StringIO()
    tsv = csv.writer(buf, delimiter='\t')
    for i in xrange(n_records):
        tsv.writerow([i, str(i), timezone.now().isoformat()])
    # rewind so the cursor reads the buffer from its start
    buf.seek(0)

    with closing(connection.cursor()) as cursor:
        cursor.copy_from(
            file=buf,
            table='app_testmodel',
            sep='\t',
            columns=('field_1', 'field_2', 'field_3'),
        )


if __name__ == '__main__':
    utils.timed(copy_from)
def _post_build_process(self, model):
    """Pipe ``model`` through ``self.config.post_build_processors`` and
    return the result.
    """
    processors = self.config.post_build_processors
    with timed("\nPost processing model..."):
        processed = toolz.pipe(model, *processors)
    return processed
def test(self, x, y):
    """Predict on ``x`` and print the accuracy and confusion matrix
    against ``y``.
    """
    # NOTE(review): `model` is not an attribute of self here; it is
    # resolved from an outer scope -- confirm this is intended.
    with timed('predict'):
        y_pred = model.predict(x)
    accuracy = accuracy_score(y, y_pred)
    print('========== accuracy_score = {}'.format(accuracy))
    print('========== confusion_matrix:')
    matrix = confusion_matrix(y, y_pred)
    print(matrix)
output_entities = output_file.create_group("/", "entities", "Entities") for table in input_file.iterNodes(input_root.entities): table_fields = get_fields(table) table_fields = [(fname, ftype) for fname, ftype in table_fields if fname not in todrop] size = (len(table) * table.dtype.itemsize) / 1024.0 / 1024.0 #noinspection PyProtectedMember print(" * copying table %s (%.2f Mb) ..." % (table._v_name, size), end=' ') copy_table(table, output_entities, table_fields) print("done.") input_file.close() output_file.close() if __name__ == '__main__': import sys import platform print("LIAM HDF5 drop fields %s using Python %s (%s)\n" % \ (__version__, platform.python_version(), platform.architecture()[0])) args = sys.argv if len(args) < 4: print("Usage: %s inputpath outputpath field1 [field2 ...]" % args[0]) sys.exit() timed(dropfields, args[1], args[2], args[3:])
def _augment(self, dataset):
    """Wrap ``dataset`` in a ``Transform`` built from
    ``self.config.augmentations`` and return it.
    """
    with timed("\nAugmenting dataset..."):
        result = Transform(dataset, self.config.augmentations)
    return result
def run(self, run_console=False):
    """Run the whole simulation: load data, simulate each period, report.

    Opens the input/output datasets through ``self.data_source.run``,
    optionally opens the autodump/autodiff file, simulates the initial
    period followed by every period in the computed list, prints a timing
    summary and, when ``run_console`` is true, starts an interactive
    console.  The HDF5 handles are closed in a ``finally`` clause.
    """
    start_time = time.time()
    h5in, h5out, globals_data = timed(self.data_source.run, self.globals_def,
                                      entity_registry, self.init_period)
    if config.autodump or config.autodiff:
        if config.autodump:
            fname, _ = config.autodump
            mode = 'w'
        else:  # config.autodiff
            fname, _ = config.autodiff
            mode = 'r'
        fpath = os.path.join(config.output_directory, fname)
        h5_autodump = tables.open_file(fpath, mode=mode)
        config.autodump_file = h5_autodump
    else:
        h5_autodump = None
    # input_dataset = self.data_source.run(self.globals_def,
    #                                      entity_registry)
    # output_dataset = self.data_sink.prepare(self.globals_def,
    #                                         entity_registry)
    # output_dataset.copy(input_dataset, self.init_period - 1)
    # for entity in input_dataset:
    #     indexed_array = buildArrayForPeriod(entity)
    # tell numpy we do not want warnings for x/0 and 0/0
    np.seterr(divide='ignore', invalid='ignore')
    # per-process accumulated time and per-period object counts, filled by
    # simulate_period below
    process_time = defaultdict(float)
    period_objects = {}
    eval_ctx = EvaluationContext(self, self.entities_map, globals_data)

    def simulate_period(period_idx, period, periods, processes, entities, init=False):
        """Simulate a single period.

        ``processes`` holds ``(process, periodicity, start)`` triples.
        Reads ``main_start_time`` from the enclosing ``run`` scope, which is
        only assigned after the initial period has been simulated.
        """
        period_start_time = time.time()
        # set current period
        eval_ctx.period = period
        if config.log_level in ("procedures", "processes"):
            print()
        print("period", period, end=" " if config.log_level == "periods" else "\n")
        if init and config.log_level in ("procedures", "processes"):
            for entity in entities:
                print(" * %s: %d individuals" % (entity.name, len(entity.array)))
        else:
            if config.log_level in ("procedures", "processes"):
                print("- loading input data")
                for entity in entities:
                    print(" *", entity.name, "...", end=' ')
                    timed(entity.load_period_data, period)
                    print(" -> %d individuals" % len(entity.array))
            else:
                for entity in entities:
                    entity.load_period_data(period)
        for entity in entities:
            entity.array_period = period
            entity.array['period'] = period
        if processes:
            # build context for this period:
            const_dict = {'period_idx': period_idx + 1, 'periods': periods, 'periodicity': time_period[self.time_scale] * (1 - 2 * (self.retro)), 'longitudinal': self.longitudinal, 'format_date': self.time_scale, 'pension': None, '__simulation__': self, 'period': period, 'nan': float('nan'), '__globals__': globals_data}
            assert(periods[period_idx + 1] == period)
            num_processes = len(processes)
            for p_num, process_def in enumerate(processes, start=1):
                process, periodicity, start = process_def
                if config.log_level in ("procedures", "processes"):
                    print("- %d/%d" % (p_num, num_processes), process.name, end=' ')
                    print("...", end=' ')
                # TODO: change that
                if isinstance(periodicity, int):
                    # integer periodicity: run every `periodicity` periods
                    if period_idx % periodicity == 0:
                        elapsed, _ = gettime(process.run_guarded, self, const_dict)
                    else:
                        elapsed = 0
                        print("skipped (periodicity)")
                else:
                    # named periodicity: must be a key of time_period
                    assert periodicity in time_period
                    periodicity_process = time_period[periodicity]
                    periodicity_simul = time_period[self.time_scale]
                    month_idx = period % 100
                    # first condition, to run a process with start == 12
                    # each year even if year are yyyy01
                    # modify start if periodicity_simul is not month
                    start = int(start / periodicity_simul - 0.01) * periodicity_simul + 1
                    if (periodicity_process <= periodicity_simul and self.time_scale != 'year0') or (
                            month_idx % periodicity_process == start % periodicity_process):
                        const_dict['periodicity'] = periodicity_process * (1 - 2 * (self.retro))
                        elapsed, _ = gettime(process.run_guarded, self, const_dict)
                    else:
                        elapsed = 0
                        if config.log_level in ("procedures", "processes"):
                            print("skipped (periodicity)")
                process_time[process.name] += elapsed
                if config.log_level in ("procedures", "processes"):
                    if config.show_timings:
                        print("done (%s elapsed)." % time2str(elapsed))
                    else:
                        print("done.")
                self.start_console(eval_ctx)
        # update longitudinal
        person = [x for x in entities if x.name == 'person'][0]
        # maybe we have a get_entity or anything more nice than that #TODO: check
        id = person.array.columns['id']
        for varname in ['sali', 'workstate']:
            var = person.array.columns[varname]
            if init:
                fpath = self.data_source.input_path
                # NOTE(review): input_file is not closed in this function
                input_file = HDFStore(fpath, mode="r")
                if 'longitudinal' in input_file.root:
                    input_longitudinal = input_file.root.longitudinal
                    if varname in input_longitudinal:
                        self.longitudinal[varname] = input_file['/longitudinal/' + varname]
                        if period not in self.longitudinal[varname].columns:
                            table = DataFrame({'id': id, period: var})
                            self.longitudinal[varname] = self.longitudinal[varname].merge(
                                table, on='id', how='outer')
                    else:
                        # when one variable is not in the input_file
                        self.longitudinal[varname] = DataFrame({'id': id, period: var})
                else:
                    # when there is no longitudinal in the dataset
                    self.longitudinal[varname] = DataFrame({'id': id, period: var})
            else:
                table = DataFrame({'id': id, period: var})
                if period in self.longitudinal[varname]:
                    # NOTE(review): drops into the debugger when the period
                    # column already exists -- looks like a leftover
                    import pdb
                    pdb.set_trace()
                self.longitudinal[varname] = self.longitudinal[varname].merge(table, on='id', how='outer')
        if config.log_level in ("procedures", "processes"):
            print("- storing period data")
            for entity in entities:
                print(" *", entity.name, "...", end=' ')
                timed(entity.store_period_data, period)
                print(" -> %d individuals" % len(entity.array))
        else:
            for entity in entities:
                entity.store_period_data(period)
        # print " - compressing period data"
        # for entity in entities:
        #     print " *", entity.name, "...",
        #     for level in range(1, 10, 2):
        #         print " %d:" % level,
        #         timed(entity.compress_period_data, level)
        period_objects[period] = sum(len(entity.array) for entity in entities)
        period_elapsed_time = time.time() - period_start_time
        if config.log_level in ("procedures", "processes"):
            print("period %d" % period, end=' ')
        print("done", end=' ')
        if config.show_timings:
            print("(%s elapsed)" % time2str(period_elapsed_time), end="")
            if init:
                print(".")
            else:
                # extrapolate the remaining time from the average so far
                main_elapsed_time = time.time() - main_start_time
                periods_done = period_idx + 1
                remaining_periods = self.periods - periods_done
                avg_time = main_elapsed_time / periods_done
                # future_time = period_elapsed_time * 0.4 + avg_time * 0.6
                remaining_time = avg_time * remaining_periods
                print(" - estimated remaining time: %s." % time2str(remaining_time))
        else:
            print()

    print(""" ===================== starting simulation =====================""")
    try:
        assert(self.time_scale in time_period)
        month_periodicity = time_period[self.time_scale]
        # +1 for a forward simulation, -1 when self.retro is true
        time_direction = 1 - 2 * (self.retro)
        time_step = month_periodicity * time_direction
        periods = [ self.init_period + int(t / 12) * 100 + t % 12 for t in range(0, (self.periods + 1) * time_step, time_step) ]
        if self.time_scale == 'year0':
            periods = [self.init_period + t for t in range(0, (self.periods + 1))]
        print("simulated period are going to be: ", periods)
        init_start_time = time.time()
        simulate_period(0, self.init_period, [None, periods[0]], self.init_processes, self.entities, init=True)
        time_init = time.time() - init_start_time
        main_start_time = time.time()
        for period_idx, period in enumerate(periods[1:]):
            period_start_time = time.time()
            simulate_period(period_idx, period, periods, self.processes, self.entities)
            # if self.legislation:
            #     if not self.legislation['ex_post']:
            #
            #         elapsed, _ = gettime(liam2of.main,period)
            #         process_time['liam2of'] += elapsed
            #         elapsed, _ = gettime(of_on_liam.main,self.legislation['annee'],[period])
            #         process_time['legislation'] += elapsed
            #         elapsed, _ = gettime(merge_leg.merge_h5,self.data_source.output_path,
            #                              "C:/Til/output/"+"simul_leg.h5",period)
            #         process_time['merge_leg'] += elapsed
            time_elapsed = time.time() - period_start_time
            print("period %d done" % period, end=' ')
            if config.show_timings:
                print("(%s elapsed)." % time2str(time_elapsed))
            else:
                print()
        total_objects = sum(period_objects[period] for period in periods)
        total_time = time.time() - main_start_time
        # if self.legislation:
        #     if self.legislation['ex_post']:
        #
        #         elapsed, _ = gettime(liam2of.main)
        #         process_time['liam2of'] += elapsed
        #         elapsed, _ = gettime(of_on_liam.main,self.legislation['annee'])
        #         process_time['legislation'] += elapsed
        #         # TODO: make this a separate program; so far it does not work for the whole set
        #         # adapting is not that easy: since we want to save a table,
        #         # we cannot append directly because 2010 is put after 2011
        #         # at some point in the computation
        #         elapsed, _ = gettime(merge_leg.merge_h5,self.data_source.output_path,
        #                              "C:/Til/output/"+"simul_leg.h5",None)
        #         process_time['merge_leg'] += elapsed
        if self.final_stat:
            # NOTE(review): `start` is not defined in this function; it must
            # come from module scope -- confirm it exists
            elapsed, _ = gettime(start, period)
            process_time['Stat'] += elapsed
        total_time = time.time() - main_start_time
        time_year = 0
        if len(periods) > 1:
            nb_year_approx = periods[-1] / 100 - periods[1] / 100
            if nb_year_approx > 0:
                time_year = total_time / nb_year_approx
        try:
            ind_per_sec = str(int(total_objects / total_time))
        except ZeroDivisionError:
            ind_per_sec = 'inf'
        print(""" ========================================== simulation done ========================================== * %s elapsed * %d individuals on average * %s individuals/s/period on average * %s second for init_process * %s time/period in average * %s time/year in average ========================================== """ % ( time2str(time.time() - start_time), total_objects / self.periods, ind_per_sec, time2str(time_init), time2str(total_time / self.periods), time2str(time_year)) )
        show_top_processes(process_time, 10)
        # if config.debug:
        #     show_top_expr()
        if run_console:
            console_ctx = eval_ctx.clone(entity_name=self.default_entity)
            c = console.Console(console_ctx)
            c.run()
    finally:
        # always release the HDF5 handles, even when the simulation failed
        if h5in is not None:
            h5in.close()
        h5out.close()
        if h5_autodump is not None:
            h5_autodump.close()
def simulate_period(period_idx, period, periods, processes, entities, init=False):
    """Simulate one period and record its longitudinal data.

    Steps, in order: set ``eval_ctx.period``; load each entity's data for
    the period; tag every entity array with the period; run the processes
    whose periodicity matches; append the 'sali' and 'workstate' columns of
    the 'person' entity to ``self.longitudinal``; store each entity's data;
    print progress/timing information.

    This is a closure: ``self``, ``eval_ctx``, ``globals_data``,
    ``process_time``, ``period_objects`` and ``main_start_time`` come from
    the enclosing scope.

    Parameters:
        period_idx: index such that ``periods[period_idx + 1] == period``
            (asserted when there are processes to run).
        period: the period identifier being simulated.
        periods: sequence of all periods; passed to processes via the
            constants dictionary.
        processes: iterable of ``(process, periodicity, start)`` tuples.
        entities: iterable of entities to load, simulate and store.
        init: True for the initialisation period.

    Raises:
        ValueError: if, outside the init period, the period is already
            present in the longitudinal table of a tracked variable.
    """
    period_start_time = time.time()
    verbose_levels = ("procedures", "processes")

    # set current period
    eval_ctx.period = period

    if config.log_level in verbose_levels:
        print()
    print("period", period, end=" " if config.log_level == "periods" else "\n")
    if init and config.log_level in verbose_levels:
        for entity in entities:
            print(" * %s: %d individuals" % (entity.name, len(entity.array)))
    else:
        # NOTE(review): for the init period this branch is only reached when
        # the log level is not verbose, so whether period data is loaded at
        # init depends on the log level -- confirm this is intended.
        if config.log_level in verbose_levels:
            print("- loading input data")
            for entity in entities:
                print(" *", entity.name, "...", end=' ')
                timed(entity.load_period_data, period)
                print(" -> %d individuals" % len(entity.array))
        else:
            for entity in entities:
                entity.load_period_data(period)
    for entity in entities:
        entity.array_period = period
        entity.array['period'] = period

    if processes:
        # build context for this period:
        const_dict = {'period_idx': period_idx + 1,
                      'periods': periods,
                      'periodicity': time_period[self.time_scale] * (1 - 2 * (self.retro)),
                      'longitudinal': self.longitudinal,
                      'format_date': self.time_scale,
                      'pension': None,
                      '__simulation__': self,
                      'period': period,
                      'nan': float('nan'),
                      '__globals__': globals_data}
        assert(periods[period_idx + 1] == period)

        num_processes = len(processes)
        for p_num, process_def in enumerate(processes, start=1):
            process, periodicity, start = process_def
            if config.log_level in verbose_levels:
                print("- %d/%d" % (p_num, num_processes), process.name, end=' ')
                print("...", end=' ')
            # TODO: change that
            if isinstance(periodicity, int):
                if period_idx % periodicity == 0:
                    elapsed, _ = gettime(process.run_guarded, self, const_dict)
                else:
                    elapsed = 0
                    # Only report the skip at verbose log levels, like the
                    # named-periodicity branch below does.
                    if config.log_level in verbose_levels:
                        print("skipped (periodicity)")
            else:
                assert periodicity in time_period
                periodicity_process = time_period[periodicity]
                periodicity_simul = time_period[self.time_scale]
                month_idx = period % 100
                # first condition, to run a process with start == 12
                # each year even if year are yyyy01
                # modify start if periodicity_simul is not month
                start = int(start / periodicity_simul - 0.01) * periodicity_simul + 1

                if (periodicity_process <= periodicity_simul and self.time_scale != 'year0') or (
                        month_idx % periodicity_process == start % periodicity_process):
                    const_dict['periodicity'] = periodicity_process * (1 - 2 * (self.retro))
                    elapsed, _ = gettime(process.run_guarded, self, const_dict)
                else:
                    elapsed = 0
                    if config.log_level in verbose_levels:
                        print("skipped (periodicity)")

            process_time[process.name] += elapsed
            if config.log_level in verbose_levels:
                if config.show_timings:
                    print("done (%s elapsed)." % time2str(elapsed))
                else:
                    print("done.")
            self.start_console(eval_ctx)

    # update longitudinal
    # maybe we have a get_entity or anything more nice than that  # TODO: check
    person = [x for x in entities if x.name == 'person'][0]
    person_ids = person.array.columns['id']
    for varname in ['sali', 'workstate']:
        var = person.array.columns[varname]
        if init:
            # Read the stored history (if any) and always release the file
            # handle, even when reading fails.
            stored = None
            input_file = HDFStore(self.data_source.input_path, mode="r")
            try:
                if 'longitudinal' in input_file.root:
                    if varname in input_file.root.longitudinal:
                        stored = input_file['/longitudinal/' + varname]
            finally:
                input_file.close()

            if stored is None:
                # no longitudinal group in the dataset, or this variable is
                # not in the input file: start a fresh table
                self.longitudinal[varname] = DataFrame({'id': person_ids, period: var})
            else:
                self.longitudinal[varname] = stored
                if period not in stored.columns:
                    table = DataFrame({'id': person_ids, period: var})
                    self.longitudinal[varname] = stored.merge(table, on='id', how='outer')
        else:
            if period in self.longitudinal[varname]:
                # This used to drop into pdb; fail loudly instead so that
                # unattended runs do not hang on a debugger prompt.
                raise ValueError(
                    "period %r is already present in the longitudinal data of %r"
                    % (period, varname))
            table = DataFrame({'id': person_ids, period: var})
            self.longitudinal[varname] = self.longitudinal[varname].merge(
                table, on='id', how='outer')

    if config.log_level in verbose_levels:
        print("- storing period data")
        for entity in entities:
            print(" *", entity.name, "...", end=' ')
            timed(entity.store_period_data, period)
            print(" -> %d individuals" % len(entity.array))
    else:
        for entity in entities:
            entity.store_period_data(period)

    period_objects[period] = sum(len(entity.array) for entity in entities)
    period_elapsed_time = time.time() - period_start_time
    if config.log_level in verbose_levels:
        print("period %d" % period, end=' ')
    print("done", end=' ')
    if config.show_timings:
        print("(%s elapsed)" % time2str(period_elapsed_time), end="")
        if init:
            print(".")
        else:
            # Extrapolate the remaining time from the average period duration.
            main_elapsed_time = time.time() - main_start_time
            periods_done = period_idx + 1
            remaining_periods = self.periods - periods_done
            avg_time = main_elapsed_time / periods_done
            # future_time = period_elapsed_time * 0.4 + avg_time * 0.6
            remaining_time = avg_time * remaining_periods
            print(" - estimated remaining time: %s." % time2str(remaining_time))
    else:
        print()
import utils
from django.utils import timezone
from app import models


def orm_bulk_create(n_records):
    """Insert ``n_records`` TestModel rows with one ORM ``bulk_create`` call.

    Row ``i`` gets ``field_1=i``, ``field_2=str(i)`` and ``field_3`` set to
    the current time when the instance is built.
    """
    pending = []
    for idx in xrange(n_records):
        row = models.TestModel(
            field_1=idx,
            field_2=str(idx),
            field_3=timezone.now(),
        )
        pending.append(row)
    models.TestModel.objects.bulk_create(pending)


if __name__ == '__main__':
    utils.timed(orm_bulk_create)
def load(self):
    """Load the input data through the data source and return the result.

    The call goes through ``timed`` and receives the globals definition and
    the entities map as arguments.
    """
    loader = self.data_source.load
    return timed(loader, self.globals_def, self.entities_map)
def simulate_period(period_idx, period, processes, entities, init=False):
    """Simulate a single period for all entities.

    Sets the period on ``eval_ctx``, loads each entity's data, stamps the
    entity arrays with the period, runs every process whose periodicity
    divides ``period_idx``, stores the entity data and prints progress and
    timing information according to ``config``.

    Closure: ``self``, ``eval_ctx``, ``process_time``, ``period_objects``
    and ``main_start_time`` are taken from the enclosing scope.
    """
    def chatty():
        # Re-read the log level on each call, as every inline check did.
        return config.log_level in ("functions", "processes")

    started_at = time.time()

    # set current period
    eval_ctx.period = period

    if chatty():
        print()
    line_end = " " if config.log_level == "periods" else "\n"
    print("period", period, end=line_end)

    if init and chatty():
        for ent in entities:
            print(" * %s: %d individuals" % (ent.name, len(ent.array)))
    elif chatty():
        print("- loading input data")
        for ent in entities:
            print(" *", ent.name, "...", end=' ')
            timed(ent.load_period_data, period)
            print(" -> %d individuals" % len(ent.array))
    else:
        # NOTE(review): also reached for the init period when the log level
        # is not verbose -- confirm that loading at init is intended.
        for ent in entities:
            ent.load_period_data(period)

    for ent in entities:
        ent.array_period = period
        ent.array['period'] = period

    if processes:
        total = len(processes)
        for position, (process, periodicity) in enumerate(processes, start=1):
            # set current entity
            eval_ctx.entity_name = process.entity.name

            if chatty():
                print("- %d/%d" % (position, total), process.name, end=' ')
                print("...", end=' ')

            if period_idx % periodicity != 0:
                elapsed = 0
                if chatty():
                    print("skipped (periodicity)")
            else:
                elapsed, _ = gettime(process.run_guarded, eval_ctx)

            process_time[process.name] += elapsed
            if chatty():
                if not config.show_timings:
                    print("done.")
                else:
                    print("done (%s elapsed)." % time2str(elapsed))
            self.start_console(eval_ctx)

    if chatty():
        print("- storing period data")
        for ent in entities:
            print(" *", ent.name, "...", end=' ')
            timed(ent.store_period_data, period)
            print(" -> %d individuals" % len(ent.array))
    else:
        for ent in entities:
            ent.store_period_data(period)

    # print " - compressing period data"
    # for entity in entities:
    #     print " *", entity.name, "...",
    #     for level in range(1, 10, 2):
    #         print " %d:" % level,
    #         timed(entity.compress_period_data, level)
    period_objects[period] = sum(len(ent.array) for ent in entities)
    period_elapsed_time = time.time() - started_at

    if chatty():
        print("period %d" % period, end=' ')
    print("done", end=' ')
    if not config.show_timings:
        print()
    else:
        print("(%s elapsed)" % time2str(period_elapsed_time), end="")
        if init:
            print(".")
        else:
            # Remaining time = average duration so far * periods left.
            main_elapsed_time = time.time() - main_start_time
            periods_done = period_idx + 1
            avg_time = main_elapsed_time / periods_done
            # future_time = period_elapsed_time * 0.4 + avg_time * 0.6
            remaining_time = avg_time * (self.periods - periods_done)
            print(" - estimated remaining time: %s." % time2str(remaining_time))
def run_single(self, run_console=False, run_num=None):
    """Run one complete simulation: load, initialise, simulate, report.

    Loads the input dataset, prepares the data sink, builds each entity's
    array for the period preceding ``start_period``, simulates the init
    period followed by ``self.periods`` periods, prints a summary and the
    top processes by time, and optionally starts an interactive console.

    Parameters:
        run_console: when true, open a console on a clone of the
            evaluation context after the simulation is done.
        run_num: not used in this function body.

    Cleanup (``self.close()``, closing the autodump file, and removing the
    output file and its directory when ``self.minimal_output`` is set)
    happens in a ``finally`` clause, so it also runs when the simulation
    raises.
    """
    start_time = time.time()

    input_dataset = timed(self.data_source.load, self.globals_def, self.entities_map)
    globals_data = input_dataset.get('globals')
    timed(self.data_sink.prepare, self.globals_def, self.entities_map, input_dataset, self.start_period - 1)

    print(" * building arrays for first simulated period")
    for ent_name, entity in self.entities_map.iteritems():
        print(" -", ent_name, "...", end=' ')
        # TODO: this whole process of merging all periods is very
        # opinionated and does not allow individuals to die/disappear
        # before the simulation starts. We couldn't for example,
        # take the output of one of our simulation and
        # re-simulate only some years in the middle, because the dead
        # would be brought back to life. In conclusion, it should be
        # optional.
        timed(entity.build_period_array, self.start_period - 1)
    print("done.")

    # The autodump file is opened for writing when dumping and for reading
    # when diffing; autodump takes precedence when both are set.
    if config.autodump or config.autodiff:
        if config.autodump:
            fname, _ = config.autodump
            mode = 'w'
        else:  # config.autodiff
            fname, _ = config.autodiff
            mode = 'r'
        fpath = os.path.join(config.output_directory, fname)
        h5_autodump = tables.open_file(fpath, mode=mode)
        config.autodump_file = h5_autodump
    else:
        h5_autodump = None

    # tell numpy we do not want warnings for x/0 and 0/0
    np.seterr(divide='ignore', invalid='ignore')

    # process name -> cumulated run time; period -> number of individuals
    process_time = defaultdict(float)
    period_objects = {}

    eval_ctx = EvaluationContext(self, self.entities_map, globals_data)

    def simulate_period(period_idx, period, processes, entities, init=False):
        # Simulate one period: load data, run the due processes, store data
        # and print progress. Reads main_start_time from the enclosing
        # scope, which is only assigned after the init period has run.
        period_start_time = time.time()

        # set current period
        eval_ctx.period = period

        if config.log_level in ("functions", "processes"):
            print()
        print("period", period, end=" " if config.log_level == "periods" else "\n")
        if init and config.log_level in ("functions", "processes"):
            for entity in entities:
                print(" * %s: %d individuals" % (entity.name, len(entity.array)))
        else:
            # NOTE(review): for the init period this branch is only reached
            # at non-verbose log levels -- confirm loading then is intended.
            if config.log_level in ("functions", "processes"):
                print("- loading input data")
                for entity in entities:
                    print(" *", entity.name, "...", end=' ')
                    timed(entity.load_period_data, period)
                    print(" -> %d individuals" % len(entity.array))
            else:
                for entity in entities:
                    entity.load_period_data(period)
        for entity in entities:
            entity.array_period = period
            entity.array['period'] = period

        if processes:
            num_processes = len(processes)
            for p_num, process_def in enumerate(processes, start=1):
                process, periodicity = process_def

                # set current entity
                eval_ctx.entity_name = process.entity.name

                if config.log_level in ("functions", "processes"):
                    print("- %d/%d" % (p_num, num_processes), process.name, end=' ')
                    print("...", end=' ')
                # a process only runs on period indices divisible by its
                # periodicity
                if period_idx % periodicity == 0:
                    elapsed, _ = gettime(process.run_guarded, eval_ctx)
                else:
                    elapsed = 0
                    if config.log_level in ("functions", "processes"):
                        print("skipped (periodicity)")

                process_time[process.name] += elapsed
                if config.log_level in ("functions", "processes"):
                    if config.show_timings:
                        print("done (%s elapsed)." % time2str(elapsed))
                    else:
                        print("done.")
                self.start_console(eval_ctx)

        if config.log_level in ("functions", "processes"):
            print("- storing period data")
            for entity in entities:
                print(" *", entity.name, "...", end=' ')
                timed(entity.store_period_data, period)
                print(" -> %d individuals" % len(entity.array))
        else:
            for entity in entities:
                entity.store_period_data(period)
        # print " - compressing period data"
        # for entity in entities:
        #     print " *", entity.name, "...",
        #     for level in range(1, 10, 2):
        #         print " %d:" % level,
        #         timed(entity.compress_period_data, level)
        period_objects[period] = sum(len(entity.array) for entity in entities)
        period_elapsed_time = time.time() - period_start_time
        if config.log_level in ("functions", "processes"):
            print("period %d" % period, end=' ')
        print("done", end=' ')
        if config.show_timings:
            print("(%s elapsed)" % time2str(period_elapsed_time), end="")
            if init:
                print(".")
            else:
                # estimate = average duration of the periods done so far
                # multiplied by the number of periods left
                main_elapsed_time = time.time() - main_start_time
                periods_done = period_idx + 1
                remaining_periods = self.periods - periods_done
                avg_time = main_elapsed_time / periods_done
                # future_time = period_elapsed_time * 0.4 + avg_time * 0.6
                remaining_time = avg_time * remaining_periods
                print(" - estimated remaining time: %s." % time2str(remaining_time))
        else:
            print()

    print(""" ===================== starting simulation =====================""")
    try:
        # the init period is the one just before start_period
        simulate_period(0, self.start_period - 1, self.init_processes, self.entities, init=True)
        main_start_time = time.time()
        periods = range(self.start_period, self.start_period + self.periods)
        for period_idx, period in enumerate(periods):
            simulate_period(period_idx, period, self.processes, self.entities)

        total_objects = sum(period_objects[period] for period in periods)
        # both statistics fall back to a placeholder instead of dividing by 0
        avg_objects = str(total_objects // self.periods) \
            if self.periods else 'N/A'
        main_elapsed_time = time.time() - main_start_time
        ind_per_sec = str(int(total_objects / main_elapsed_time)) \
            if main_elapsed_time else 'inf'

        print(""" ========================================== simulation done ========================================== * %s elapsed * %s individuals on average * %s individuals/s/period on average ========================================== """ % (time2str(time.time() - start_time), avg_objects, ind_per_sec))

        show_top_processes(process_time, 10)
        # if config.debug:
        #     show_top_expr()

        if run_console:
            ent_name = self.default_entity
            # with a single entity and no default, use that entity
            if ent_name is None and len(eval_ctx.entities) == 1:
                ent_name = eval_ctx.entities.keys()[0]
            # FIXME: fresh_data prevents the old (cloned) EvaluationContext
            # to be referenced from each EntityContext, which lead to period
            # being fixed to the last period of the simulation. This should
            # be fixed in EvaluationContext.copy but the proper fix breaks
            # stuff (see the comments there)
            console_ctx = eval_ctx.clone(fresh_data=True, entity_name=ent_name)
            c = console.Console(console_ctx)
            c.run()
    finally:
        self.close()
        if h5_autodump is not None:
            h5_autodump.close()
        if self.minimal_output:
            # remove the output file, then its (expected to be empty)
            # directory; a failure is only reported as a warning
            output_path = self.data_sink.output_path
            dirname = os.path.dirname(output_path)
            try:
                os.remove(output_path)
                os.rmdir(dirname)
            except OSError:
                print("WARNING: could not delete temporary directory: %r" % dirname)
def load(self):
    """Load the input data through the data source and return the result.

    The call goes through ``timed`` and receives the globals definition and
    the entity registry as arguments.
    """
    loader = self.data_source.load
    return timed(loader, self.globals_def, entity_registry)
elif ent_name in ent_names1: output_array = input1_array elif ent_name in ent_names2: output_array = input2_array else: raise Exception("this shouldn't have happened") output_table.append(output_array) output_table.flush() loop_wh_progress(merge_period, output_periods) print " done." input1_file.close() input2_file.close() output_file.close() if __name__ == '__main__': import sys import platform print "LIAM HDF5 merge %s using Python %s (%s)\n" % \ (__version__, platform.python_version(), platform.architecture()[0]) args = sys.argv if len(args) < 4: print "Usage: %s inputpath1 inputpath2 outputpath" % args[0] sys.exit() timed(merge_h5, args[1], args[2], args[3])
# copy globals if copy_globals: # noinspection PyProtectedMember input_file.root.globals._f_copy(output_file.root, recursive=True) output_entities = output_file.create_group("/", "entities", "Entities") for table in input_file.iterNodes(input_file.root.entities): # noinspection PyProtectedMember print(table._v_name, "...") copy_table(table, output_entities, condition=condition) input_file.close() output_file.close() if __name__ == '__main__': import sys import platform print("LIAM HDF5 filter %s using Python %s (%s)\n" % (__version__, platform.python_version(), platform.architecture()[0])) args = dict(enumerate(sys.argv)) if len(args) < 4: print("""Usage: {} inputpath outputpath condition [copy_globals] where condition is an expression copy_globals is True (default)|False""".format(args[0])) sys.exit() timed(filter_h5, args[1], args[2], args[3], eval(args.get(4, 'True')))
output_file = tables.openFile(output_path, mode="w") # copy globals input_file.root.globals._f_copy(output_file.root, recursive=True) output_entities = output_file.createGroup("/", "entities", "Entities") for table in input_file.iterNodes(input_file.root.entities): print table._v_name, "..." copyTable(table, output_entities, condition=condition) input_file.close() output_file.close() if __name__ == "__main__": import sys import platform print "LIAM HDF5 filter %s using Python %s (%s)\n" % ( __version__, platform.python_version(), platform.architecture()[0], ) args = sys.argv if len(args) < 4: print "Usage: %s inputpath outputpath condition" % args[0] sys.exit() timed(filter_h5, args[1], args[2], args[3])
import utils
from contextlib import closing
from django.db import connection
from django.utils import timezone


def sql_simple_insert_executemany(n_records):
    """Insert ``n_records`` rows into ``app_testmodel`` via ``executemany``.

    Row ``i`` is ``(i, str(i), <current time>)``. The cursor is closed when
    the statement has been executed, or if it raises.
    """
    with closing(connection.cursor()) as cur:
        rows = []
        for idx in xrange(n_records):
            rows.append((idx, str(idx), timezone.now()))
        cur.executemany(
            'INSERT INTO app_testmodel (field_1, field_2, field_3)'
            'VALUES (%s, %s, %s)',
            rows,
        )


if __name__ == '__main__':
    utils.timed(sql_simple_insert_executemany)
def main() -> None:
    """Solve both parts in order, each inside its own ``utils.timed()`` block."""
    with utils.timed():
        first_answer = part1()
        print(f"Part 1: {first_answer}")

    with utils.timed():
        second_answer = part2()
        print(f"Part 2: {second_answer}")
def load_data():
    """Return the feature tensor ``X`` and target matrix ``y``.

    If ``XY_CACHE`` exists, both arrays are read from it. Otherwise they are
    built from ``data/calendar.csv``, ``data/sales_train_validation.csv``
    and ``load_sales()``, then written to ``XY_CACHE`` (compressed).

    Feature layout along the last axis of ``X`` (19 features):
        0-4   calendar: wday, day of month, month, event_name_1 code,
              dense-coded snap flags (CA + 2*TX + 4*WI)
        5-8   ordinals of dept_id, cat_id, store_id, state_id
        9-13  mean, min, max, std, nonzero count of each id's sales over the
              365 days ending ``VALID_DAYS`` before the target day
        14-18 the same five statistics, computed from the columns sharing
              the id's item_id and assigned per column

    Returns:
        ``(X, y)`` with ``X.shape == (target_days, ALL_IDS, 19)`` and ``y``
        the sales rows from ``SKIP_DAYS`` onwards.
    """
    if os.path.isfile(XY_CACHE):
        print('loading from cache')
        # Close the archive once both arrays have been read; np.load leaves
        # the .npz file handle open otherwise.
        with np.load(XY_CACHE) as cached:
            return cached['X'], cached['y']

    # NOTE(review): the extent of this timed block was reconstructed from
    # its label (the CSV-loading part) -- confirm against the original.
    with timed('loading data from csv...'):
        day_frame = pd.read_csv('data/calendar.csv')
        # feats:
        #   day of week
        #   day of month
        #   month
        #   event_1 (encoded in [0, 32])
        #   snap (encoded in [0, 2**3])
        day_feats = np.zeros((ALL_DAYS, 5), dtype=np.uint8)
        assert len(day_frame) == ALL_DAYS
        day_feats[:, 0] = day_frame['wday'].values
        # parse the day part from YYYY-MM-DD
        day_feats[:, 1] = [int(date.split('-')[2]) for date in day_frame['date']]
        day_feats[:, 2] = day_frame['month'].values
        # for simplicity, ignore event_name_2
        # TODO try using it
        _, event_codes = np.unique(
            day_frame['event_name_1'].values.astype(str), return_inverse=True)
        day_feats[:, 3] = event_codes
        # for simplicity, dense-code the snap
        # TODO eventually match it properly based on geography
        day_feats[:, 4] = (day_frame['snap_CA'].values
                           + 2 * day_frame['snap_TX'].values
                           + 4 * day_frame['snap_WI'].values)

        # sanity-check the encoded ranges of every calendar feature
        assert np.min(day_feats[:, 0]) == 1
        assert np.max(day_feats[:, 0]) == 7
        assert np.min(day_feats[:, 1]) == 1
        assert np.max(day_feats[:, 1]) == 31
        assert np.min(day_feats[:, 2]) == 1
        assert np.max(day_feats[:, 2]) == 12
        assert np.min(day_feats[:, 3]) == 0
        assert np.max(day_feats[:, 3]) == 30
        assert np.min(day_feats[:, 4]) == 0
        assert np.max(day_feats[:, 4]) == 7

        sales_frame = pd.read_csv('data/sales_train_validation.csv')
        sales = load_sales()

    # y is (days, ids) after the first year
    y = sales[SKIP_DAYS:, :]

    uniques = {}
    ordinals = {}
    for col in ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
        u, inverse = np.unique(sales_frame[col], return_inverse=True)
        uniques[col] = u
        ordinals[col] = inverse

    target_days = DATA_DAYS - SKIP_DAYS
    feats = 5 + 4 + (5 * 2)
    X = np.zeros((target_days, feats, ALL_IDS), dtype=np.float32)

    # broadcast over all ids
    train_day_feats = day_feats[SKIP_DAYS:SKIP_DAYS + target_days, :]
    X[:, 0:5, :] = train_day_feats.reshape(target_days, 5, 1)

    # for dept_id, cat_id, store_id, state_id there are few enough values we
    # can just pass them in as ordinals directly.
    # broadcast over all days
    X[:, 5, :] = ordinals['dept_id']
    X[:, 6, :] = ordinals['cat_id']
    X[:, 7, :] = ordinals['store_id']
    X[:, 8, :] = ordinals['state_id']

    # for id and item_id, try a bunch of different embeddings.
    # for now, all sales aggregations must be validation-safe:
    # they only use info from t - VALID_DAYS and earlier.
    # start with:
    #   (1) mean, min, max, std, nonzero over previous year
    # TODO:
    #   try over different windows (month, week)
    #   try over different slices (sharing holiday; sharing day of week; etc.)

    # The columns belonging to each item do not depend on the day, so find
    # them once instead of rebuilding a boolean mask per (day, item) pair.
    item_ordinals = ordinals['item_id']
    item_columns = [np.flatnonzero(item_ordinals == i)
                    for i in range(len(uniques['item_id']))]

    # ID features
    for t in range(target_days):
        if (t % 100) == 0:
            print(f'{t}/{target_days}')
        d = SKIP_DAYS + t - VALID_DAYS
        assert d - 365 >= 0
        group = sales[d - 365:d]
        X[t, 9, :] = np.mean(group, axis=0)
        X[t, 10, :] = np.min(group, axis=0)
        X[t, 11, :] = np.max(group, axis=0)
        X[t, 12, :] = np.std(group, axis=0)
        X[t, 13, :] = np.count_nonzero(group, axis=0)

        for cols in item_columns:
            item_group = group[:, cols]
            X[t, 14, cols] = np.mean(item_group, axis=0)
            X[t, 15, cols] = np.min(item_group, axis=0)
            X[t, 16, cols] = np.max(item_group, axis=0)
            X[t, 17, cols] = np.std(item_group, axis=0)
            X[t, 18, cols] = np.count_nonzero(item_group, axis=0)

    # (days, feats, ids) -> (days, ids, feats)
    X = np.swapaxes(X, 1, 2)
    assert X.shape == (target_days, ALL_IDS, feats)

    with timed('saving...'):
        np.savez_compressed(XY_CACHE, X=X, y=y)

    return X, y
def merge_h5(input1_path, input2_path, output_path):
    """Merge two HDF5 files into a new file at ``output_path``.

    The 'globals' groups are merged on the 'PERIOD' field and the 'entities'
    groups on the 'period' field (both via ``merge_group``). Every file that
    was opened is closed again, even if opening a later file or the merge
    itself raises.
    """
    input1_file = tables.open_file(input1_path)
    try:
        input2_file = tables.open_file(input2_path)
        try:
            output_file = tables.open_file(output_path, mode="w")
            try:
                input1root = input1_file.root
                input2root = input2_file.root
                merge_group(input1root, input2root, 'globals', output_file, 'PERIOD')
                merge_group(input1root, input2root, 'entities', output_file, 'period')
            finally:
                output_file.close()
        finally:
            input2_file.close()
    finally:
        input1_file.close()


if __name__ == '__main__':
    import sys
    import platform

    print("LIAM HDF5 merge %s using Python %s (%s)\n" %
          (__version__, platform.python_version(), platform.architecture()[0]))

    args = sys.argv
    if len(args) < 4:
        print("Usage: %s inputpath1 inputpath2 outputpath" % args[0])
        sys.exit()

    timed(merge_h5, args[1], args[2], args[3])
return l1_norm(exp(self.θ @ atleast_2d(x).T), axis=0) def predict(self, x): # 输出概率最大的数字 return argmax(self.predict_proba(x), axis=0) def test(self, x, y): with timed('predict'): y_pred = model.predict(x) print('========== accuracy_score = {}'.format(accuracy_score(y, y_pred))) print('========== confusion_matrix:') print(confusion_matrix(y, y_pred)) if __name__ == '__main__': with timed('prepare data'): (X_train, y_train), (X_test, y_test) = mnist.load_data() # 注意要加上截距(方便模型学到偏置) X_train = append_bias(X_train) X_test = append_bias(X_test) # 与 softmax_wendesi保持类似的数据规模 X_train, y_train = sample_dataset(X_train, y_train, n=28140, seed=66) X_test, y_test = sample_dataset(X_test, y_test, n=13860, seed=66) model = Softmax() with timed('fit'): # 当η=1e-3 也会出现 inf model.fit(X_train, y_train, n_epoch=100, η=1e-4) model.test(X_test, y_test)
def run(self, run_console=False):
    """Run the whole simulation and print a timing/size summary.

    Runs the data source, optionally opens the autodump/autodiff file,
    simulates the init period (``start_period - 1``) and then
    ``self.periods`` periods, prints the summary and the top processes by
    time, and optionally starts an interactive console.

    Parameters:
        run_console: when true, open a console on the last simulated period
            once the simulation is done.

    The input, output and autodump files are closed in a ``finally``
    clause, so they are released even when the simulation raises.
    """
    start_time = time.time()
    h5in, h5out, globals_data = timed(self.data_source.run, self.globals_def,
                                      entity_registry, self.start_period - 1)

    # The autodump file is opened for writing when dumping and for reading
    # when diffing; autodump takes precedence when both are set.
    if config.autodump or config.autodiff:
        if config.autodump:
            fname, _ = config.autodump
            mode = 'w'
        else:  # config.autodiff
            fname, _ = config.autodiff
            mode = 'r'
        fpath = os.path.join(config.output_directory, fname)
        h5_autodump = tables.openFile(fpath, mode=mode)
        config.autodump_file = h5_autodump
    else:
        h5_autodump = None

    # input_dataset = self.data_source.run(self.globals_def,
    #                                      entity_registry)
    # output_dataset = self.data_sink.prepare(self.globals_def,
    #                                         entity_registry)
    # output_dataset.copy(input_dataset, self.start_period - 1)
    # for entity in input_dataset:
    #     indexed_array = build_period_array(entity)

    # tell numpy we do not want warnings for x/0 and 0/0
    np.seterr(divide='ignore', invalid='ignore')

    # process name -> cumulated run time; period -> number of individuals
    process_time = defaultdict(float)
    period_objects = {}

    def simulate_period(period_idx, period, processes, entities, init=False):
        # Simulate one period: load data (except at init), run the due
        # processes and store the data.
        print("\nperiod", period)
        if init:
            for entity in entities:
                print(" * %s: %d individuals" % (entity.name, len(entity.array)))
        else:
            print("- loading input data")
            for entity in entities:
                print(" *", entity.name, "...", end=' ')
                timed(entity.load_period_data, period)
                print(" -> %d individuals" % len(entity.array))
        for entity in entities:
            entity.array_period = period
            entity.array['period'] = period

        if processes:
            # build context for this period:
            const_dict = {'__simulation__': self,
                          'period': period,
                          'nan': float('nan'),
                          '__globals__': globals_data}

            num_processes = len(processes)
            for p_num, process_def in enumerate(processes, start=1):
                process, periodicity = process_def

                print("- %d/%d" % (p_num, num_processes), process.name, end=' ')
                print("...", end=' ')
                # a process only runs on period indices divisible by its
                # periodicity
                if period_idx % periodicity == 0:
                    elapsed, _ = gettime(process.run_guarded, self, const_dict)
                else:
                    elapsed = 0
                    print("skipped (periodicity)")

                process_time[process.name] += elapsed
                if config.show_timings:
                    print("done (%s elapsed)." % time2str(elapsed))
                else:
                    print("done.")
                self.start_console(process.entity, period, globals_data)

        print("- storing period data")
        for entity in entities:
            print(" *", entity.name, "...", end=' ')
            timed(entity.store_period_data, period)
            print(" -> %d individuals" % len(entity.array))
        # print " - compressing period data"
        # for entity in entities:
        #     print " *", entity.name, "...",
        #     for level in range(1, 10, 2):
        #         print " %d:" % level,
        #         timed(entity.compress_period_data, level)
        period_objects[period] = sum(len(entity.array) for entity in entities)

    try:
        simulate_period(0, self.start_period - 1, self.init_processes,
                        self.entities, init=True)
        main_start_time = time.time()
        periods = range(self.start_period, self.start_period + self.periods)
        for period_idx, period in enumerate(periods):
            period_start_time = time.time()
            simulate_period(period_idx, period, self.processes, self.entities)
            time_elapsed = time.time() - period_start_time
            print("period %d done" % period, end=' ')
            if config.show_timings:
                print("(%s elapsed)." % time2str(time_elapsed))
            else:
                print()

        total_objects = sum(period_objects[period] for period in periods)
        total_time = time.time() - main_start_time
        # With zero simulated periods there is no average to report; avoid
        # the ZeroDivisionError the summary used to raise in that case.
        avg_objects = str(total_objects // self.periods) \
            if self.periods else 'N/A'
        try:
            ind_per_sec = str(int(total_objects / total_time))
        except ZeroDivisionError:
            ind_per_sec = 'inf'

        print(""" ========================================== simulation done ========================================== * %s elapsed * %s individuals on average * %s individuals/s/period on average ========================================== """ % (time2str(time.time() - start_time), avg_objects, ind_per_sec))

        show_top_processes(process_time, 10)
        # if config.debug:
        #     show_top_expr()

        if run_console:
            # Fall back to the init period when no period was simulated, so
            # that periods[-1] cannot raise IndexError.
            last_period = periods[-1] if periods else self.start_period - 1
            c = console.Console(self.console_entity, last_period,
                                self.globals_def, globals_data)
            c.run()

    finally:
        if h5in is not None:
            h5in.close()
        h5out.close()
        if h5_autodump is not None:
            h5_autodump.close()