def build_srl(configs):
    # Collect the ids of all merged Wikipedia documents, then process them in chunks.
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    db = client[config.DB]
    wikipedia = db[config.WIKIMERGE_COLLECTION]
    documents_id = list(wikipedia.find({}, {"id": 1, "_id": 0}).sort("id"))
    client.close()

    start_time = time.time()
    total = 0
    total_extracted = 0
    total_skipped = 0
    chunks = get_chunks(documents_id, config.CHUNK_SIZE, 'id')

    if config.NUM_WORKERS == 1:
        # Single-process mode: process chunks sequentially.
        for chunk in chunks:
            build(chunk, {})
    else:
        # Multi-process mode: fan chunks out to a worker pool and aggregate the per-chunk stats.
        pool = multiprocessing.Pool(config.NUM_WORKERS)
        for res in pool.imap(partial(build, configs=configs), chunks):
            total += res['processed']
            if 'extracted' in res:
                total_extracted += res['extracted']
                total_skipped += res['skipped']
                res['total_extracted'] = total_extracted
                res['total_skipped'] = total_skipped
            res['total'] = total
            elapsed = int(time.time() - start_time)
            res['total_elapsed'] = compress(elapsed)
            res['elapsed'] = compress(res['elapsed'])
            logging.info(', '.join("{!s}={!r}".format(key, val) for key, val in res.items()))
        pool.terminate()

    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {} - Total extracted {}".format(
        total, compress(elapsed), total_extracted))
    return
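# build(), get_chunks() and compress() are defined elsewhere in this project. As an
# illustration only, here is a minimal sketch of what a chunking helper like
# get_chunks() might look like, assuming it splits the sorted document list into
# slices of at most `size` items; the real implementation may instead return
# (first_key, last_key) ranges keyed by `field` for range queries.
def get_chunks_sketch(documents, size, field):
    """Hypothetical: yield successive slices of `documents` with at most `size` items.

    `field` mirrors the real helper's signature; the real code likely uses it to
    build key ranges for MongoDB queries.
    """
    for start in range(0, len(documents), size):
        yield documents[start:start + size]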
def run_qa():
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    db = client[config.DB]
    wikipedia = db[config.WIKIPEDIA_COLLECTION]
    wikidocs = list(wikipedia.find({}, {'wikidata_id': 1, '_id': 0}).sort('wikidata_id'))
    chunks = get_chunks(wikidocs, config.CHUNK_SIZE, 'wikidata_id')
    del wikidocs

    start_time = time.time()
    total = 0
    pool = multiprocessing.Pool(config.NUM_WORKERS)
    for res in pool.imap(qa, chunks):
        total += res['processed']
        res['total'] = total
        part = int(time.time() - start_time)
        res['elapsed'] = compress(res['elapsed'])
        res['total_elapsed'] = compress(part)
        logging.info("Processed {processed} ({total} in total) documents in {elapsed} "
                     "(running time {total_elapsed})".format(**res))
    pool.terminate()

    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {}".format(total, compress(elapsed)))
    return
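# qa() is the worker mapped over the pool above; it is defined elsewhere. The loop
# in run_qa() only relies on the result dict exposing 'processed' (documents handled
# in the chunk) and 'elapsed' (seconds spent on the chunk). A hypothetical worker
# honouring that contract could look like this sketch:
def qa_sketch(chunk):
    """Hypothetical worker: process one chunk and report count and timing."""
    import time
    start = time.time()
    processed = 0
    for _doc in chunk:
        # ... real QA extraction would happen here ...
        processed += 1
    return {'processed': processed, 'elapsed': int(time.time() - start)}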
def aws(sample=False):
    if sample:
        return Section('no AWS environment', 'WHITE', '#333')
    section = Section('no AWS environment', 'WHITE', '#333')
    if 'AWS_PROFILE' not in _env:
        return section
    section = Section('AWS: %s ' % _env['AWS_PROFILE'], 'GREEN', '#333')
    if _env.get('AWS_SESSIONS') and _env['AWS_PROFILE'] in _env['AWS_SESSIONS']:
        try:
            remaining = _env['AWS_SESSIONS'][_env['AWS_PROFILE']][_EXPIRATION] - int(time.time())
            if remaining > 300:
                section = Section('AWS: %s(%s) ' % (_env['AWS_PROFILE'], compress(remaining)),
                                  'GREEN', '#333')
            else:
                # Session expires in five minutes or less: switch to a red background.
                section = Section('AWS: %s(%s) ' % (_env['AWS_PROFILE'], compress(remaining)),
                                  'GREEN', 'RED')
        except Exception:
            pass
    return section
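# Section, _env and _EXPIRATION are defined elsewhere in this status-line module.
# For illustration only, a Section compatible with aws() can be modelled as a
# simple named tuple of (text, foreground, background); this is an assumption,
# not the module's actual definition.
from collections import namedtuple

SectionSketch = namedtuple('SectionSketch', ['text', 'fg', 'bg'])

# e.g. a session close to expiry might render roughly as:
#   SectionSketch(text='AWS: dev(4m59s) ', fg='GREEN', bg='RED')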
def validate_data(dataset_path: Path):
    from natural.date import compress
    from datetime import timedelta

    for mf_type in ["train_manifest.json", "test_manifest.json"]:
        data_file = dataset_path / Path(mf_type)
        print(f"validating {data_file}.")
        with Path(data_file).open("r") as pf:
            data_jsonl = pf.readlines()

        duration = 0
        errors = 0
        for (i, s) in enumerate(data_jsonl):
            try:
                d = json.loads(s)
                duration += d["duration"]
                audio_file = data_file.parent / Path(d["audio_filepath"])
                if not audio_file.exists():
                    raise OSError(f"File {audio_file} not found")
            except Exception as e:
                errors += 1
                print(f'failed on {i} with "{e}"')

        duration_str = compress(timedelta(seconds=duration), pad=" ")
        if errors == 0:
            print(f"no errors found. seems like a valid {mf_type}. contains {duration_str} of audio")
        else:
            print(f"found {errors} invalid entries in {mf_type}. contains {duration_str} of audio")
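# validate_data() expects JSONL manifests: one JSON object per line with at least
# "audio_filepath" and "duration" keys (any other keys are ignored here). A
# hypothetical valid line looks like:
#
#   {"audio_filepath": "wavs/sample_0001.wav", "duration": 3.42, "text": "hello"}
#
# and could be produced by a sketch like the following (file names and the "text"
# field are illustrative assumptions):
import json

def write_manifest_line_sketch(fh, audio_filepath: str, duration: float, text: str = ""):
    """Hypothetical helper: append one manifest entry as a JSON line."""
    fh.write(json.dumps({"audio_filepath": audio_filepath,
                         "duration": duration,
                         "text": text}) + "\n")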
def compress(value, sign=False, pad=''):
    '''Wrapper for :func:`natural.date.compress`'''
    return date.compress(value, sign, pad)
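# Usage note (based on the natural library's documented behaviour; exact output may
# vary with the installed version): date.compress() renders a number of seconds, a
# timedelta, or a timestamp as abbreviated units joined by `pad`, roughly:
#
#   >>> compress(3661)               # something like '1h1m1s'
#   >>> compress(3661, pad=' ')      # something like '1h 1m 1s'
#   >>> compress(-3661, sign=True)   # includes a leading sign when sign=True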
def kfold_validation(train_config: TrainingConfiguration, input_data, input_labels):
    kf = KFold(n_splits=5, shuffle=True)
    fold_scores = []
    current_fold = 1
    start_time = time.time()

    # Workaround for a bug on RTX GPUs: allow_growth must be set to True to run the CNN.
    reset_keras()

    for train_index, test_index in kf.split(input_data):
        x_train, x_test = input_data[train_index], input_data[test_index]
        y_train, y_test = input_labels.values[train_index], input_labels.values[test_index]
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)

        if train_config.additional_train_data is not None:
            print("Additional data:", train_config.additional_train_data[0].shape)
            x_train = np.concatenate([x_train, train_config.additional_train_data[0]])
            y_train = np.concatenate([y_train, train_config.additional_train_data[1]])

        train_generator = train_config.generator(x_train, y_train, batch_size=32)
        x, y = train_generator[0]
        input_shape = get_input_shape(x[0])
        num_classes = y[0].shape[0]
        print("Training shape:", input_shape)
        print("Prediction classes:", num_classes)
        print("Training data size:", x_train.shape, "Validation data:", x_val.shape,
              "Test data:", x_test.shape)

        x_val = train_config.load_files(x_val)

        # Create the 2D conv model for this fold.
        model = train_config.create_model_func(input_shape, num_classes)
        opt = keras.optimizers.Adam(lr=train_config.learning_rate)
        lr_metric = get_lr_metric(opt)
        model.compile(loss=bce_with_logits, optimizer=opt, metrics=[tf_lwlrap, lr_metric])

        log_folder_name = create_log_dir(train_config)
        callbacks = [
            keras.callbacks.TensorBoard(log_dir=log_folder_name,
                                        histogram_freq=0,
                                        batch_size=32,
                                        write_graph=True,
                                        write_grads=False,
                                        write_images=False,
                                        embeddings_freq=0,
                                        embeddings_layer_names=None,
                                        embeddings_metadata=None,
                                        embeddings_data=None,
                                        update_freq='epoch'),
            EarlyStoppingByLWLRAP(validation_data=(x_val, y_val),
                                  patience=20,
                                  restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor='val_tf_lwlrap',
                                              patience=train_config.reduce_lr_patience,
                                              min_lr=train_config.min_lr,
                                              factor=train_config.reduce_lr_factor,
                                              mode='max'),
            keras.callbacks.ModelCheckpoint('./models/{}_{}.h5'.format(train_config.model_name, current_fold),
                                            monitor='val_tf_lwlrap',
                                            verbose=1,
                                            save_best_only=True,
                                            mode='max')
        ]

        model.fit_generator(train_generator,
                            epochs=train_config.num_epoch,
                            callbacks=callbacks,
                            validation_data=(x_val, y_val),
                            verbose=2)

        x_test_data = train_config.load_files(x_test)
        y_pred = model.predict(x_test_data)
        lwlrap = calculate_overall_lwlrap_sklearn(y_test, y_pred)
        print("Fold {} Score: {}".format(current_fold, lwlrap))
        # calculate_and_dump_lwlrap_per_class(x_test, y_test, y_pred, "per_class_lwlrap_fold_{}.csv".format(current_fold))
        current_fold += 1
        fold_scores.append(lwlrap)
        # Only the first fold is evaluated; remove this break to run all 5 folds.
        break

    print(fold_scores)
    print("Average Fold Score:", np.mean(fold_scores))
    print("Time taken: {}".format(date.compress(time.time() - start_time)))
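# get_lr_metric(), get_input_shape(), bce_with_logits, tf_lwlrap and
# EarlyStoppingByLWLRAP are project helpers defined elsewhere. As an assumption
# (not necessarily this project's version), a learning-rate metric like
# get_lr_metric() is commonly written as a closure over the optimizer:
def get_lr_metric_sketch(optimizer):
    """Hypothetical: expose the optimizer's current learning rate as a Keras metric."""
    def lr(y_true, y_pred):  # Keras metrics receive (y_true, y_pred); both are ignored here
        return optimizer.lr
    return lr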
def compress(value, sign=False, pad=u''):
    '''Wrapper for :func:`natural.date.compress`'''
    return date.compress(value, sign, pad)
def run_benchmark(config):
    """Benchmark script for tiledb-vcf"""
    # Open yaml config file
    with open(config, 'r') as stream:
        try:
            benchmarking_start = time.time()
            results = []
            config = yaml.safe_load(stream)
            base_cmd = config['base_command']
            iterations = config['iterations']
            ingestion_files = config['ingestion_files']
            attribute_results = {}
            suite_index = 0
            suite_names = []
            errors = {}

            # Get the size of the files being ingested
            ingestion_size = 0
            for ingestion_file in ingestion_files:
                ingestion_size += os.path.getsize(ingestion_file) / (1024 * 1024)

            # Loop through each test suite
            for suite_name, test_set in config['suites'].items():
                suite_names.append(suite_name)
                test_results = {}
                # Run each suite the given number of iterations
                iteration_count = 0
                for i in range(iterations):
                    iteration_count += 1
                    array_uri = test_set['array_uri']
                    group_uri = test_set['group_uri']
                    dir_to_rm = None
                    if 'group_uri' in test_set:
                        dir_to_rm = group_uri
                    else:
                        dir_to_rm = array_uri
                    if dir_to_rm is not None and os.path.isdir(dir_to_rm):
                        shutil.rmtree(dir_to_rm)
                    if not os.path.isdir(group_uri):
                        pathlib.Path(group_uri).mkdir(parents=True, exist_ok=True)

                    # Run each test in the suite
                    for test in test_set['tests']:
                        # Flush caches
                        flush_caches()
                        test_name = test["name"]
                        logger.info("Starting test %s - %s iteration %d", suite_name, test_name, i)
                        # Add specified arguments
                        cmd = [base_cmd] + test['args']
                        # Add group uri argument
                        cmd.extend(["-a", array_uri])
                        # If store or register add ingestion files
                        if test_name == "store" or test_name == "register":
                            cmd.append("-f")
                            cmd.extend(ingestion_files)
                        if test_name == "export":
                            export_path = os.path.join(group_uri, "export")
                            if not os.path.isdir(export_path):
                                os.mkdir(export_path)
                            #cmd.extend(["-p", export_path + os.path.sep])
                        logger.info("Running: %s", list2cmdline(cmd))

                        # Time and run test command
                        t0 = time.time()
                        t1 = None
                        ret = None
                        try:
                            ret = call(cmd)
                            t1 = time.time()
                        except Exception:
                            if suite_name not in errors:
                                errors[suite_name] = {}
                            if test_name not in errors[suite_name]:
                                errors[suite_name][test_name] = []
                            errors[suite_name][test_name].append({
                                "iteration": i,
                                "ret_code": ret
                            })
                            logging.error(traceback.format_exc())
                            continue

                        array_size = 0
                        tiledb_file_sizes = None
                        if 'check_array_size' in test and test['check_array_size']:
                            array_size = get_folder_size(array_uri)
                            tiledb_file_sizes = get_tiledb_file_sizes(array_uri)

                        # Save results
                        if test_name not in test_results:
                            test_results[test_name] = {"time": [], "size": [], "file_sizes": {}}
                        test_results[test_name]["time"].append(t1 - t0)
                        test_results[test_name]["size"].append(array_size)
                        if tiledb_file_sizes is not None:
                            for file_name, size in tiledb_file_sizes.items():
                                if file_name not in test_results[test_name]["file_sizes"]:
                                    test_results[test_name]["file_sizes"][file_name] = []
                                test_results[test_name]["file_sizes"][file_name].append(size)

                # If there was a store test we should save results for printing table at the end
                if 'store' in test_results:
                    ingestion_times = test_results["store"]["time"]
                    ingestion_time_avg = numpy.average(ingestion_times)
                    size_avg = numpy.average(test_results["store"]["size"]) / (1024 * 1024)
                    ingestion_time_std = numpy.std(ingestion_times)
                    export_time_avg = 'N/A'
                    export_time_std = 'N/A'
                    if 'export' in test_results:
                        export_times = test_results["export"]["time"]
                        export_time_avg = numpy.average(export_times)
                        export_time_std = numpy.std(export_times)
                    results.append([
                        suite_name, iteration_count, ingestion_time_avg, ingestion_time_std,
                        size_avg, ingestion_size, export_time_avg, export_time_std
                    ])
                    for file_name, file_sizes in test_results['store']["file_sizes"].items():
                        if file_name not in attribute_results:
                            attribute_results[file_name] = [None] * len(config['suites'])  # {suite_name: 'N/A'}
                        file_size_avg = numpy.average(file_sizes) / (1024 * 1024)
                        attribute_results[file_name][suite_index] = file_size_avg

                suite_index += 1

                # Remove directory to save space again
                dir_to_rm = None
                if 'group_uri' in test_set:
                    dir_to_rm = group_uri
                else:
                    dir_to_rm = array_uri
                if dir_to_rm is not None and os.path.isdir(dir_to_rm):
                    shutil.rmtree(dir_to_rm)

            header = [
                'Test', 'Iterations', 'Ingestion Time (seconds)',
                'Ingestion Time (seconds) STDDEV', 'Array Size (MB)',
                'Ingestion Size (MB)', 'Export Time (seconds)',
                'Export Time STDDEV (seconds)'
            ]
            t = PrettyTable(header)
            for result in results:
                t.add_row(result)

            data = ",".join(header) + "\n"
            for result in results:
                data += ",".join(map(str, result)) + "\n"
            logger.info(data)
            print("")
            print(t)

            t = PrettyTable()
            t.add_column("Test", suite_names)
            for file_name, sizes in attribute_results.items():
                t.add_column(file_name, sizes)
            #for result in attribute_results:
            #    print(result)
            #    t.add_row(result)
            #for index in range(len(suite_names)):
            #    results = []  #[None] * len(attribute_results)
            #    for file_name, result in attribute_results.items():
            #        results.append(result[index])
            #    t.add_column(suite_names[index], results)
            # Set file_name column
            #file_name_results = []  #[None] * len(attribute_results)
            #for file_name, result in attribute_results.items():
            #    file_name_results.append(file_name)
            #t.add_column("file_name", file_name_results)
            print("")
            print(t)

            data = ",".join(t.field_names) + "\n"
            for row in t._get_rows(t._get_options({})):
                data += ",".join(map(str, row)) + "\n"
            logger.info(data)

            logger.info("Total time taken to run benchmark was: %s",
                        date.compress(time.time() - benchmarking_start))

            if errors:
                logger.error("Errors detected in run, dumping details:")
                logger.error(errors)
        except yaml.YAMLError as exc:
            print(exc)
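# flush_caches(), get_folder_size() and get_tiledb_file_sizes() are helpers defined
# elsewhere in the benchmark script. For illustration only, a folder-size helper
# compatible with the usage above could be sketched as follows (an assumption, not
# the script's actual implementation):
import os

def get_folder_size_sketch(path):
    """Hypothetical: total size in bytes of all regular files under `path`."""
    total = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            if os.path.isfile(file_path):
                total += os.path.getsize(file_path)
    return total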