def main():
    data_dir = '/tmp/mnist'
    model_dir = '/tmp/model'
    batch_size = 128
    delete_dir(model_dir)
    data_format = ('channels_first' if tf.test.is_built_with_cuda()
                   else 'channels_last')
    params = {
        'data_format': data_format,
        'learning_rate': 1e-4
    }
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, params=params)

    def train_input_fn():
        ds = dataset.train(data_dir)
        ds = ds.cache()
        ds = ds.shuffle(buffer_size=50000)
        ds = ds.batch(batch_size)
        ds = ds.repeat(1)
        return ds

    def eval_input_fn():
        ds = dataset.test(data_dir)
        ds = ds.batch(batch_size)
        return ds

    print('Train model')
    train_hooks = [tf.train.LoggingTensorHook(
        tensors=['cross_entropy', 'train_accuracy'], every_n_iter=20)]
def create_data(output_dir, source_train, source_test, source_dev,
                gen_batch=64, length=128, device=0):
    delete_dir(output_dir)
    os.makedirs(output_dir)
    for datatype, source_path in zip(["train", "test", "dev"],
                                     [source_train, source_test, source_dev]):
        with open(source_path) as f:
            data = f.read()
        data = data.split("<|endoftext|>")
        data = data[1:-1]
        len_ = len(data)
        data = [text.replace("\n", "").replace("\t", "") for text in data]

        steps = int(len_ / gen_batch) + 1
        texts = []
        for _ in trange(steps):
            texts += generate_text(
                model,
                tokenizer,
                prompt="",
                length=length,
                num_return_sequences=gen_batch,
                device=device,
            )
        texts = texts[:len_]
        # Cleaning the strings:
        texts = [text.replace("\n", "").replace("\t", "") for text in texts]

        total_data = []
        labels = [0] * len_ + [1] * len_
        import random
        random.shuffle(labels)
        for label in labels:
            if label:
                total_data.append(data.pop())
            else:
                total_data.append(texts.pop())

        df = pd.DataFrame({"sentence": total_data, "label": labels})
        if datatype == "test":
            df["sentence"].to_csv(join(output_dir, "test.tsv"),
                                  sep="\t", index_label="index")
            df["label"].to_csv(join(output_dir, "test_answers.tsv"),
                               sep="\t", index_label="index")
        else:
            df.to_csv(join(output_dir, datatype + ".tsv"),
                      sep="\t", index=False)
def tearDownClass(self):
    """
    clean up
    :return:
    """
    self.logging.info('--> TestFilesystems.tearDownClass()')
    self.logging.debug('delete the partition created in setup class')
    utils.delete_dir(self.mountpt)
    utils.delete_dir(self.nfs_mount_pt)
    utils.del_eckd_partition(self.dev)
    self.logging.info('<-- TestFilesystems.tearDownClass()')
def rouge(preds_file, targets_file):
    temp_targets_dir, temp_preds_dir = './temp_targets/', './temp_preds/'
    summaries_to_rouge_format(targets_file, temp_targets_dir, "targets")
    summaries_to_rouge_format(preds_file, temp_preds_dir, "preds")
    os.system("""python -m rouge.rouge \
        --target_filepattern={}*.targets \
        --prediction_filepattern={}*.preds \
        --output_filename=rouge_scores.csv""".format(
        temp_targets_dir, temp_preds_dir))
    delete_dir(temp_targets_dir)
    delete_dir(temp_preds_dir)
def download(self):
    utils.delete_dir(self.base_path)
    if not self.dict_result:
        return
    info_df = pd.DataFrame(self.dict_result.values(),
                           index=self.dict_result.keys())
    if len(info_df) > 0:
        dir_path = f'{self.base_path}/{settings.xbrl_dir_name}{settings.since}/'
        self.__make_directory(dir_path)
        self.__download_all_xbrl_files(info_df, dir_path)
def train(
    dataloader,
    output_dir,
    epochs=6,
    log_steps=200,
    learning_rate=5e-6,
    fp16=True,
    debug_stop=False,
    device=0,
    optimizer=None,
    lr_scheduler=None,
):
    if optimizer is None:
        optimizer, lr_scheduler = create_optimizer_and_scheduler(
            model, dataloader, epochs, learning_rate=learning_rate)
    delete_dir(output_dir)
    for epoch in range(epochs):
        total_loss = 0
        for step, inputs in enumerate(tqdm(dataloader), 1):
            model.train()
            _prepare_inputs(inputs, device)
            if fp16:
                total_loss += fp16_train_step(model, inputs) / log_steps
                fp16_optimizer_step(model, optimizer)
            else:
                total_loss += train_step(model, inputs) / log_steps
                optimizer_step(model, optimizer)
            lr_scheduler.step()
            model.zero_grad()
            # Logging
            if step % log_steps == 0:
                print(
                    f"step: {step} (lr = {optimizer.param_groups[0]['lr']}), loss: {total_loss}"
                )
                total_loss = 0
            if debug_stop:
                break
        eval_loss = evaluate(testloader, fp16=fp16, device=device)
        perplexity = float(math.exp(eval_loss))
        print("perplexity:", perplexity)
        save_dir = '{}_ep{}_perplexity{}'.format(int(time()), epoch, perplexity)
        save(model, save_dir, optimizer, lr_scheduler, output_dir)
def main():
    data_dir = '/tmp/mnist'
    model_dir = '/tmp/model'
    batch_size = 128
    use_dataset = True
    delete_dir(model_dir)

    def train_input_fn():
        ds = dataset.train(data_dir)
        ds = ds.repeat(None)
        ds = ds.shuffle(buffer_size=50000)
        ds = ds.batch(batch_size)
        return ds

    def eval_input_fn():
        ds = dataset.test(data_dir)
        ds = ds.batch(batch_size)
        return ds

    model_params = {
        'learning_rate': 1e-4,
        'hidden_size': 512,
        'keep_rate': 0.5
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_dir,
                                       params=model_params)

    for _ in range(100):
        print('Train model')
        train_hooks = [
            tf.train.LoggingTensorHook(
                tensors=['global_step', 'cross_entropy'], every_n_iter=1),
            SampleHook(loss_tensor='cross_entropy', checkpoint_path=model_dir)
        ]
        estimator.train(input_fn=train_input_fn, steps=200, hooks=train_hooks)

        print('Evaluate model')
        eval_hooks = [
            tf.train.LoggingTensorHook(tensors=['global_step'], every_n_iter=1)
        ]
        eval_results = estimator.evaluate(input_fn=eval_input_fn,
                                          hooks=eval_hooks)
        print('Eval loss: %s' % eval_results['loss'])
        print('Eval accuracy: %s' % eval_results['accuracy'])
def uploadPackage(self, request_iterator, context):
    """
    Upload an app package
    :param request_iterator:
    :param context:
    :return:
    """
    LOG.info('receive upload package msg...')
    res = UploadPackageResponse(status=utils.FAILURE)

    parameters = UploadPackageRequest(request_iterator)
    host_ip = validate_input_params(parameters)
    if host_ip is None:
        parameters.delete_tmp()
        return res

    app_package_id = parameters.app_package_id
    if app_package_id is None:
        LOG.debug('appPackageId is required')
        parameters.delete_tmp()
        return res

    app_package_path = utils.APP_PACKAGE_DIR + '/' + host_ip + '/' + parameters.app_package_id
    if utils.exists_path(app_package_path):
        LOG.debug('app package exist')
        parameters.delete_tmp()
        return res
    utils.create_dir(app_package_path)

    try:
        LOG.debug('unzip package')
        with zipfile.ZipFile(parameters.tmp_package_file_path) as zip_file:
            namelist = zip_file.namelist()
            for file in namelist:
                zip_file.extract(file, app_package_path)
        pkg = CsarPkg(app_package_path)
        pkg.translate()
        res.status = utils.SUCCESS
    except Exception as exception:
        LOG.error(exception, exc_info=True)
        utils.delete_dir(app_package_path)
    finally:
        parameters.delete_tmp()
    return res
def backup_info(request, cluster_id, path=None):
    if path is None:
        path = "/home/backup/"
    path = u.join_path(path, cluster_id)
    try:
        u.create_dir(path)
        ct_id, ukp_id, default_image_id, instance_ids = save_cluster_info(
            request, cluster_id, path)
        node_groups_template_ids = save_cluster_template_info(
            request, ct_id, path)
        flavor_ids, image_ids, security_group_ids = save_node_groups_info(
            request, node_groups_template_ids, path)
        image_ids = append_to_list(default_image_id, image_ids)
        save_key_pair(request, ukp_id, path)
        save_flavors_info(request, flavor_ids, path)
        save_security_group_info(request, security_group_ids, path)
        save_images_tag(request, image_ids, path)
        # backup_instance(request, instance_ids, path)
    except Exception:
        u.delete_dir(path)
        return False
    else:
        return True
def create_dataset(
    data_source,
    output_dir,
    required_in_title=None,
    upvoted=False,
    skip_first=0,
    bad_words=None,
):
    with open(data_source) as f:
        data = json.load(f)
    delete_dir(output_dir)
    makedirs(output_dir)
    data = [x for x in data if ("selftext" not in x) or (not x["selftext"])]
    if required_in_title is not None:
        for required in required_in_title:
            data = [x for x in data if required in x["title"].lower()]
    if upvoted:
        data = [x for x in data if x["score"] > 1]
    data = [x["title"] for x in data]
    data = data[skip_first:]  # Removing the first `skip_first` older examples
    # Trying to remove most prompts that directly ask to write or describe something
    if bad_words is not None:
        for word in bad_words:
            data = [x for x in data if (word not in x.lower())]
    data = [remove_tags(x) for x in data]
    data = list(set(data))  # removing reposts

    split_1 = int(len(data) * .9)
    split_2 = int(len(data) * .95)
    random.shuffle(data)
    data_train = data[:split_1]
    print("Train dataset of length", len(data_train))
    data_test = data[split_1:split_2]
    print("Test dataset of length", len(data_test))
    data_val = data[split_2:]
    print("Dev dataset of length", len(data_val))

    write(join(output_dir, "train.txt"), data_train)
    write(join(output_dir, "test.txt"), data_test)
    write(join(output_dir, "dev.txt"), data_val)
    print("data saved in", join(getcwd(), output_dir))
def deletePackage(self, request, context):
    """
    Delete an app package
    :param request:
    :param context:
    :return:
    """
    LOG.info('receive delete package msg...')
    res = DeletePackageResponse(status=utils.FAILURE)

    host_ip = validate_input_params(request)
    if host_ip is None:
        return res

    app_package_id = request.appPackageId
    if not app_package_id:
        return res

    app_package_path = utils.APP_PACKAGE_DIR + '/' + host_ip + '/' + app_package_id
    utils.delete_dir(app_package_path)
    res.status = utils.SUCCESS
    return res
def draw_document_distribution(trending_topics, count_topics, total, domain):
    domain_nor = domain.replace(u' ', u'-').lower()
    output_dir = os.path.join(u'static', domain_nor)
    utils.delete_dir(output_dir)
    utils.mkdir(output_dir)
    objects = []
    for k in xrange(len(count_topics)):
        try:
            if len(count_topics) >= 50:
                _ = trending_topics[k]
            objects.append(unicode(k))
        except:
            objects.append(u'')
    performance = map(lambda x: x * 100, count_topics)
    y_pos = np.arange(len(objects))
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('percent')
    plt.title('Document distribution by topics - num_docs = %d' % (total))
    # plt.show()
    plt.tight_layout(pad=0.4, w_pad=1.4, h_pad=1.0)
    plt.savefig(os.path.join(output_dir, 'documents_distribution.png'), dpi=100)
def tearDown(self):
    utils.delete_dir("")
def get_download(branch, download_store_path, list_products):
    site_url = 'http://lisbon-build.pentaho.com/hosted/' + branch + '/latest/'
    file_build_info = 'build.info'

    ##
    # READ LATEST DOWNLOAD BUILD FROM LOCAL FILE
    # 1. read local file
    # 2. read latest download build
    ##

    # Download all files
    # Download build info file - to know the current version
    download_build_info = site_url + file_build_info
    log.debug(download_build_info)
    try:
        response = urllib.request.urlopen(download_build_info, timeout=50)
    except Exception as e:
        log.debug('Something went wrong.')
        log.debug(e)
        sys.exit('Something went wrong. Download Build.Info.')

    data = response.read().decode('utf-8')
    data = data.replace('\n', ' ')
    log.debug('Content: ' + data)
    build_info_version = data.split(' ')[0]

    downloaded_file = []
    if list_products:
        list_download_artifacts = list_products
    else:
        list_download_artifacts = glist_download_artifacts
    log.debug(list_products)
    log.debug(list_download_artifacts)

    for filename in list_download_artifacts:
        try:
            require_version_check = False
            need_download_new_build = True
            log.debug('-------')
            log.debug('DOWNLOADING [' + filename + ']')
            log.debug('-------')
            latest_version = 0

            # 1. read local file
            latest_build_file_path = os.path.join(os.environ['TMP'],
                                                  'pentahobuildinfo',
                                                  branch, filename)
            latest_build_file_path = os.path.normpath(latest_build_file_path)
            os.makedirs(latest_build_file_path, exist_ok=True)
            latest_build_file_path = os.path.join(
                os.path.normpath(latest_build_file_path),
                'last_download_build.txt')

            # 1.1. If the local file doesn't exist, then we need to download a new build
            if not os.path.isfile(latest_build_file_path):
                log.debug('File does not exist.')
            else:
                require_version_check = True
                log.info('File exist: [' + latest_build_file_path + '].')
                # Need to read file and get version number.
                with open(latest_build_file_path, "r") as file_handler:
                    latest_version = file_handler.read().replace('\n', '').replace(' ', '')
                log.debug('Previous saved build was [' + latest_version + ']')

            # 2. Download the "Build.info".
            log.debug("Require New Version Checker [" + str(require_version_check) + '].')
            log.debug("Download Build Info version [" + build_info_version + '].')
            log.debug("Last downloaded version [" + str(latest_version) + '].')
            if require_version_check and (latest_version == build_info_version):
                log.debug('We do not need to download a new version. Latest [' +
                          latest_version + '] and Current [' + build_info_version + ']')
                need_download_new_build = False

            if need_download_new_build:
                download_fail = False
                log.debug('We are going to download a new version [' +
                          build_info_version + ']')

                # Create directory if does not exist or delete all contents of it
                download_store_path = os.path.realpath(download_store_path)
                log.debug('Writing content to this directory [' + download_store_path + ']')
                if not os.path.exists(download_store_path):
                    log.debug('Create directory [' + download_store_path + ']')
                    os.makedirs(download_store_path)

                # Let's download the files.
                store_directory = os.path.join(download_store_path, filename)
                # Going to delete the folder of the artifact and the zip file
                tmp_store_filename = os.path.join(download_store_path, filename) + '.zip'
                log.debug('Deleting previous artifacts.')
                # delete the folder - unzipped previously e.g. pentaho-server-ce
                log.debug('Delete store directory [' + store_directory + ']')
                utils.delete_dir(store_directory)
                # delete the download zip previously e.g. pentaho-server-ce.zip
                log.debug('Delete zip file [' + tmp_store_filename + ']')
                utils.delete_file(tmp_store_filename)

                download_url = site_url + filename + '.zip'
                log.debug('Downloading file: [' + filename + '] [' + download_url + '].')
                wget.download(download_url, download_store_path)
                log.debug('Download completed!')

                # Need to save in the file the download version
                # We are using the CONTENT MANAGER that close the stream for us
                with open(latest_build_file_path, "w+") as text_file:
                    print(build_info_version, file=text_file)
                log.debug('Save on file [' + latest_build_file_path +
                          '] the latest build version [' + build_info_version + '].')

                utils.unzip_single_file(download_store_path, filename)
        except Exception as e:
            download_fail = True
            log.exception(e)
            break
suite = unittest.TestSuite()
result = unittest.TestResult()
runner = unittest.TextTestRunner(verbosity=2)

#-------------------------------------------------------------------
# run tests in the listed modules
#-------------------------------------------------------------------

moduleNames = """
    test_list
    test_redirect
    test_read
    test_write
    test_cross_origin
    test_browser
""".split()

modules = [__import__(moduleName) for moduleName in moduleNames]
for module in modules:
    suite.addTest(unittest.defaultTestLoader.loadTestsFromModule(module))

server = utils.Server()
server.start()
try:
    runner.run(suite)
finally:
    server.stop()
    utils.delete_dir("")
# start
root_dir = os.getcwd()
token = read_file_contents('token')
github = 'https://api.github.com/repos/KanoComputing/{}/tarball/{}'

for name, branch in repos_selected:
    url = github.format(name, branch)
    dir_str = '{}___{}'.format(name, branch)
    dir_path = os.path.join(root_dir, dir_str)
    debfile = ''
    print

    if args.down:
        print 'Downloading {} ...'.format(dir_str)
        delete_dir(dir_path)
        ensure_dir(dir_path)
        os.chdir(dir_path)

        if not token:
            cmd = 'curl -L -v -o tmp.tgz {url}'.format(url=url)
        else:
            cmd = 'curl -H "Authorization: token {token}" -L -v -o tmp.tgz {url}'.format(
                token=token, url=url)
        _, e, _ = run_cmd(cmd)
        if args.verbose:
            print e
        if '< Status: 302 Found' in e:
            print 'OK'
def reset(self):
    utils.delete_dir(self.result_dir)
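# The snippets in this section all rely on a `delete_dir(path)` helper from a
# local `utils` module whose definition is not shown. A minimal sketch,
# assuming it simply removes a directory tree and tolerates a missing path
# (the actual helper may differ):
import os
import shutil

def delete_dir(dir_path):
    """Remove `dir_path` and all of its contents; do nothing if it is absent."""
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)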
EXCLUDE_DIRS = [
    '.vscode', 'node_modules', 'log', '.git', 'dist', '.idea', '.nvmrc',
    '.DS_Store', '__pycache__', 'coverage', '.nyc_output'
]
EXCLUDE_FILES = [
    '.gitignore', '.cfignore', 'package-lock.json', '.directory',
    'README.md', '.nyc_output'
]
EXCLUDE_EXTENSION = [
    '.jpg', '.png', '.jpeg', '.md', '.swap', '.opts', '.log', '.svg',
    '.pdf', '.otf', '.ttf', '.eot', '.woff2', '.woff', '.gif', '.psd',
    '.xls', '.xlsx'
]
CLONE_DIR = './repositories'
OUTPUT_DIR = './output/'

delete_dir(dir_path=CLONE_DIR)
delete_dir(dir_path=OUTPUT_DIR)
create_dir(dir_path=CLONE_DIR)
create_dir(dir_path=OUTPUT_DIR)

for project in PROJECTS:
    project_base_dirs = []
    project_base_files = []
    project_exclude_files = EXCLUDE_FILES + project["exclude_extension"]
    project_exclude_extension = EXCLUDE_EXTENSION + project["exclude_extension"]
    clone_repository_dir = os.path.join(CLONE_DIR, project["name"])
    git_clone(CLONE_DIR, project["repository"])
    for item in os.listdir(clone_repository_dir):
        item_path = os.path.join(clone_repository_dir, item)
        if os.path.isdir(item_path):
def delete_tmp(self):
    """
    Delete temporary files
    """
    utils.delete_dir(self._tmp_package_dir)
def main():
    data_dir = '/tmp/mnist'
    model_dir = '/tmp/model'
    batch_size = 128
    use_dataset = True
    delete_dir(model_dir)

    if use_dataset:
        # Use `tf.data.Dataset` to read train and eval data.
        def train_input_fn():
            ds = dataset.train(data_dir)
            ds = ds.cache()
            ds = ds.shuffle(buffer_size=50000)
            ds = ds.batch(batch_size)
            ds = ds.repeat(1)
            return ds

        def eval_input_fn():
            ds = dataset.test(data_dir)
            ds = ds.batch(batch_size)
            return ds
    else:
        # Use `numpy_input_fn()` to read train and evaluation data
        # from Numpy arrays.
        mnist = input_data.read_data_sets(data_dir)
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'X': mnist.train.images},
            y=mnist.train.labels.astype(np.int32),
            num_epochs=1,
            batch_size=batch_size,
            shuffle=True)
        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'X': mnist.test.images},
            y=mnist.test.labels.astype(np.int32),
            num_epochs=1,
            batch_size=batch_size,
            shuffle=False)

    model_params = {
        'learning_rate': 1e-4,
        'hidden_size': 512,
        'keep_rate': 0.5
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_dir,
                                       params=model_params)

    print('Train model')
    train_hooks = [
        tf.train.LoggingTensorHook(
            tensors=['learning_rate', 'cross_entropy', 'train_accuracy'],
            every_n_iter=20)
    ]
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    print('Evaluate model')
    eval_results = estimator.evaluate(input_fn=eval_input_fn)
    print('Eval loss: %s' % eval_results['loss'])
    print('Eval accuracy: %s' % eval_results['accuracy'])

    print('Generate some predictions:')
    preds = estimator.predict(input_fn=eval_input_fn)
    for _ in range(5):
        print(next(preds)['class'])
def delete(self, name):
    name = normalize_name(name)
    dataset_path = self.datasets_dir + name
    delete_dir(dataset_path)
def kaldi_stt(file_path, transcript=None, std_bash=False, tmp_dir=TMP_DIR,
              model_dir=s5_path):
    """ Kaldi speech to text decoding """
    nj = 1
    beam = 14
    lat_beam = 6
    sr = 8000

    file_path_old = file_path[:-4] + ".old.wav"
    os.rename(file_path, file_path_old)
    speaker_id = file_path.rsplit("/", 1)[1]
    speaker_dir = os.path.join(tmp_dir, speaker_id)
    wav_data_dir = os.path.join(speaker_dir, "data")
    wav_path = os.path.join(wav_data_dir, speaker_id)
    create_dir(wav_data_dir)
    transform_audio_file(file_path_old, wav_path, rate=sr)

    wavscp_path = os.path.join(speaker_dir, "wav.scp")
    with open(wavscp_path, "w") as scp_file:
        scp_file.write(f"{speaker_id} {wav_path}\n")
    utt2spk_path = os.path.join(speaker_dir, "utt2spk")
    with open(utt2spk_path, "w") as scp_file:
        scp_file.write(f"{speaker_id} {speaker_id}\n")
    spk2utt_path = os.path.join(speaker_dir, "spk2utt")
    with open(spk2utt_path, "w") as scp_file:
        scp_file.write(f"{speaker_id} {speaker_id}\n")
    if transcript:
        text_path = os.path.join(speaker_dir, "text")
        with open(text_path, "w") as scp_file:
            scp_file.write(f"{speaker_id} {transcript}\n")

    speaker_dir = f"{TMP_DIR}/{speaker_id}"
    decode_dir = f"{model_dir}/exp/tri5_ali/{speaker_id}"

    export = f"""
    export nj={nj}
    export beam={beam}
    export lat_beam={lat_beam}
    export KALDI_ROOT={KALDI_PATH}
    export s5_path={model_dir}
    export decode_dir="{TMP_DIR}/{speaker_id}"
    export model_dir="{model_dir}/exp/tri5_ali"
    export decode_res_dir="{model_dir}/exp/tri5_ali/{speaker_id}"
    [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
    export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
    [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
    . $KALDI_ROOT/tools/config/common_path.sh
    export LC_ALL=C
    """
    make_features = """
    cd $s5_path
    steps/make_mfcc.sh --nj $nj $decode_dir $decode_dir/log/mfcc
    steps/compute_cmvn_stats.sh $decode_dir $decode_dir/log/mfcc
    """
    if transcript:
        decode = """
        if [ -d $decode_res_dir ]; then
            rm -rf $decode_res_dir
        fi
        steps/decode.sh --skip-scoring false --beam $beam --lattice-beam $lat_beam --nj $nj $model_dir/graph/ $decode_dir $decode_res_dir
        """
    else:
        decode = """
        if [ -d $decode_res_dir ]; then
            rm -rf $decode_res_dir
        fi
        steps/decode.sh --skip-scoring false --beam $beam --lattice-beam $lat_beam --nj $nj $model_dir/graph/ $decode_dir $decode_res_dir
        """
    extract_res = """
    find ${decode_res_dir} -name lat*.gz -exec bash -c \
        'lattice-best-path --acoustic-scale=0.085 --word-symbol-table=exp/tri5_ali/graph/words.txt ark:"gunzip -c {} |" ark,t:${decode_res_dir}/one-best.tra_$(basename ${0/gz/txt})' {} \;
    cat ${decode_res_dir}/one-best*.txt >> ${decode_res_dir}/all.txt
    utils/int2sym.pl -f 2- exp/tri5_ali/graph/words.txt ${decode_res_dir}/all.txt > ${decode_res_dir}/best_hyp.txt
    """

    p = subprocess.Popen(export + make_features + decode + extract_res,
                         shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()

    if transcript:
        with open(os.path.join(model_dir, "exp", "tri5_ali", speaker_id,
                               'scoring_kaldi', 'best_wer')) as file_:
            wer = file_.readlines()
        wer = re.findall(r"(.*)\]\s+", wer[0])[0].strip() + ']'
        with open(os.path.join(model_dir, "exp", "tri5_ali", speaker_id,
                               'scoring_kaldi', 'wer_details', 'per_utt')) as file_:
            wer_details = file_.readlines()
        wer_details = "".join(wer_details)

    tb = "\n\n".join([stdout.decode(), stderr.decode()])
    if std_bash:
        print(tb)

    with open(f"{decode_dir}/best_hyp.txt", 'r', encoding='utf-8') as f:
        res = f.readlines()

    thrash_folders = (speaker_dir, decode_dir)
    for folder in thrash_folders:
        delete_dir(folder)

    try:
        res = res[0].split(" ", 1)[1], "Success!"
    except IndexError:
        res = tb, "Failed!"
        logging.error(f"Kaldi traceback:\n{tb}")
    # return res
    if transcript:
        return res, wer, wer_details
    else:
        return res
def reset_all(self):
    utils.delete_dir(self.domain_output_dir)
def setUp(self):
    print
    self.client = utils.Client()
    utils.delete_dir("")
    utils.create_dir("")