def run_tool(tool_name: str, wav_file: Path, aux_file: any, output_file: Path):
    """Run a speech-tools ``run.sh`` script and check that it produced output.

    The command executed is
    ``bash <speech_tools_path>/tools/<tool_name>/run.sh --dist-path
    <speech_tools_path>/dist --tmp-path <tmp> <wav_file> [<aux_file>]
    <output_file>`` with ``speech_tools_path`` as working directory.
    Combined stdout/stderr of the script is written to
    ``<output_file>_log.txt``.

    Args:
        tool_name: Name of the sub-directory of ``tools`` holding ``run.sh``.
        wav_file: Audio file passed to the script.
        aux_file: Optional extra input; appended to the command only if truthy.
        output_file: Path the script is expected to create.

    Raises:
        RuntimeError: If the script exits with a non-zero status, or if
            ``output_file`` does not exist afterwards.
    """
    # Imported locally and used through the module: this file also defines a
    # top-level ``run()`` (the worker loop), so a bare ``run(...)`` call here
    # would not reach ``subprocess.run``.
    import subprocess

    tmp_subdir = Path(mkdtemp(dir=tmp_dir))
    try:
        cmd = [
            'bash',
            str(speech_tools_path / 'tools' / tool_name / 'run.sh'),
            '--dist-path', str(speech_tools_path / 'dist'),
            '--tmp-path', str(tmp_subdir),
            str(wav_file),
        ]
        if aux_file:
            cmd.append(str(aux_file))
        cmd.append(str(output_file))

        with open(str(output_file) + '_log.txt', 'w') as log:
            logger.info(f'Running {" ".join(cmd)}')
            try:
                subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT,
                               check=True, cwd=speech_tools_path)
            except subprocess.CalledProcessError as err:
                raise RuntimeError(
                    f'error running script for {output_file}') from err
    finally:
        # Scratch directory is removed on every exit path, including a
        # failure to open the log file.
        if tmp_subdir.exists():
            rmtree(str(tmp_subdir))

    if not output_file.exists():
        raise RuntimeError(f'{output_file} missing')
def package(work_dir: Path, project_id: str, db) -> Path:
    """Export a project as a zipped ``*_emuDB`` directory tree.

    The project document is looked up in ``db.clarin.emu``. A temporary
    ``<name>_emuDB`` directory is created under ``work_dir`` containing
    ``<name>_DBconfig.json`` plus, for every usable bundle,
    ``<session>_ses/<bundle>_bndl/<bundle>.wav`` and
    ``<bundle>_annot.json``. Bundles lacking an ``audio`` or ``seg`` entry,
    or whose files cannot be fetched by ``get_file``, are skipped.

    Args:
        work_dir: Directory in which the temporary tree and archive are made.
        project_id: String form of the project's ``_id``.
        db: Database handle exposing ``clarin.emu`` and accepted by
            ``get_file``.

    Returns:
        Path of the ``.zip`` file (the temporary directory's path with a
        ``.zip`` suffix). The directory itself is removed before returning.

    Raises:
        RuntimeError: If the project does not exist or is marked ``deleted``.
    """
    proj = db.clarin.emu.find_one({'_id': ObjectId(project_id)})
    if not proj:
        raise RuntimeError('project not found')
    if 'deleted' in proj:
        raise RuntimeError('project deleted')

    emu_suffix = '_emuDB'
    db_dir = Path(mkdtemp(suffix=emu_suffix, dir=work_dir))
    proj_name = db_dir.name[:-len(emu_suffix)]
    logger.info(f'Saving CTM in {db_dir} (zip)...')

    config = get_config(proj_name, feats)
    with open(db_dir / f'{proj_name}_DBconfig.json', 'w') as f:
        json.dump(config, f, indent=4)

    # Group the usable bundles by session name.
    # ``dict.iteritems()`` does not exist on Python 3; only values are needed.
    sessions = {}
    for bundle in proj['bundles'].values():
        if 'audio' not in bundle or 'seg' not in bundle:
            continue
        entry = {
            'name': bundle['name'],
            'audio': get_file(db, bundle['audio'], work_dir),
            'ctm': get_file(db, bundle['seg'], work_dir),
        }
        if not entry['audio'] or not entry['ctm']:
            continue
        sessions.setdefault(bundle['session'], []).append(entry)

    for sess, bndls in sessions.items():
        sess_dir = db_dir / f'{sess}_ses'
        sess_dir.mkdir()
        for bndl in bndls:
            name = bndl['name']
            bndl_dir = sess_dir / f'{name}_bndl'
            bndl_dir.mkdir()

            # File names are built explicitly rather than with
            # ``Path.with_suffix``: that method rejects a suffix that does
            # not start with '.', and would also cut a bundle name at its
            # last dot.
            wav_path = bndl_dir / f'{name}.wav'
            annot_path = bndl_dir / f'{name}_annot.json'

            shutil.copy(bndl['audio'], wav_path)
            annot = segmentation_to_emu_annot(bndl['ctm'], name)
            with open(annot_path, 'w') as f:
                json.dump(annot, f, indent=4)
            run_feat(feats, wav_path)

    zip_path = db_dir.with_suffix('.zip')
    make_archive(db_dir, zip_path)
    shutil.rmtree(db_dir)
    return zip_path
def normalize(task: Dict[str, any]) -> Path:
    """Write a normalized copy of a text file into ``work_dir``.

    Every input line is lower-cased and then passed through the ``pat``,
    ``num`` and ``ws`` substitutions (each match replaced by one space), in
    that order. The result goes to a new ``.txt`` temporary file that is
    kept after the function returns.

    Args:
        task: Task document; ``task['input']`` is the input file path
            relative to ``work_dir``.

    Returns:
        Path of the output file relative to ``work_dir``.
    """
    src = work_dir / task['input']
    # ``NamedTemporaryFile`` defaults to binary mode ('w+b'); text mode is
    # required because ``str`` lines are written below.
    with NamedTemporaryFile(mode='w', dir=work_dir, suffix='.txt',
                            delete=False) as fout:
        output = Path(fout.name)
        logger.info(f'Normalizing text file {src} -> {fout.name}')
        # ``src`` already includes ``work_dir``; joining it again would
        # double the prefix whenever ``work_dir`` is a relative path.
        with open(src) as fin:
            for line in fin:
                line = line.lower()
                line = pat.sub(' ', line)
                line = num.sub(' ', line)
                line = ws.sub(' ', line)
                fout.write(line)
    return output.relative_to(work_dir)
def ffmpeg(task: Dict[str, any]) -> Path:
    """Convert an audio file with the ``ffmpeg`` command-line tool.

    ffmpeg is invoked with ``-acodec pcm_s16le -ac 1 -ar 16k`` and writes to
    a freshly named ``.wav`` file inside ``work_dir``. Its combined
    stdout/stderr is stored next to the output as ``<output>_ffmpeg.log``.

    Args:
        task: Task document; ``task['input']`` is the input file path
            relative to ``work_dir``.

    Returns:
        Path of the converted file relative to ``work_dir``.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exits with a non-zero
            status, or finishes without creating the output file.
    """
    # Imported locally and used through the module: this file also defines a
    # top-level ``run()`` (the worker loop), so a bare ``run(...)`` call here
    # would not reach ``subprocess.run``.
    import subprocess

    src = work_dir / task['input']

    # The temporary file is only used to obtain a unique name; it is deleted
    # when the ``with`` block closes and ffmpeg then creates it again.
    with NamedTemporaryFile(dir=work_dir, suffix='.wav') as f:
        tmp = Path(f.name)
    log_path = str(tmp) + '_ffmpeg.log'

    # Every element must be a ``str`` so the command can be joined for
    # logging. ``src`` already contains ``work_dir``.
    cmd = ['ffmpeg', '-y', '-i', str(src),
           '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16k', str(tmp)]
    logger.info('Running {}'.format(' '.join(cmd)))

    try:
        with open(log_path, 'w') as log:
            subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT,
                           check=True)
    except (subprocess.SubprocessError, OSError) as err:
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # are no longer converted into a task error.
        raise RuntimeError(
            'error in call cmd -- check ' + log_path) from err

    if not tmp.exists():
        raise RuntimeError(
            'error in ffmpeg (no output file) -- check ' + log_path)
    return tmp.relative_to(work_dir)
def run():
    """Poll the ``tasks`` collection forever and execute queued tasks.

    Once per second the oldest document (by ``time``) with
    ``in_progress == False`` and ``done == False`` is claimed by setting
    ``in_progress`` to ``True``. Its ``task`` field selects a handler from
    ``tasks_map``; the handler is called with the whole task document.
    Afterwards the document is updated with ``done=True``,
    ``in_progress=False`` and either ``result`` (``str`` of the handler's
    return value) or ``error``.

    This function never returns.
    """
    from pymongo import MongoClient

    # The original wrapped this in a second ``while True``; the polling loop
    # below never exits, so that outer loop could only ever run once.
    db = MongoClient(host=db_host)[db_name]
    logger.info('Worker queue waiting...')

    while True:
        sleep(1)
        task_data = db.tasks.find_one_and_update(
            filter={'$and': [{'in_progress': False}, {'done': False}]},
            update={'$set': {'in_progress': True}},
            sort=[('time', ASCENDING)])
        if not task_data:
            continue

        task_type = task_data['task']
        logger.info(f'Performing {task_type}...')

        outcome = {'done': True, 'in_progress': False}
        if task_type in tasks_map:
            handler = tasks_map[task_type]
            try:
                outcome['result'] = str(handler(task_data))
            except RuntimeError as e:
                outcome['error'] = str(e)
            except Exception as e:
                # Any other failure used to terminate the worker and leave
                # the task permanently flagged ``in_progress``. Record it on
                # the task and keep serving the queue instead.
                logger.exception(f'Unexpected error in {task_type}')
                outcome['error'] = f'{type(e).__name__}: {e}'
        else:
            logger.error(f'Unknown task: {task_type}')
            outcome['error'] = f'Unknown task: {task_type}'

        db.tasks.update_one({'_id': ObjectId(task_data['_id'])},
                            {'$set': outcome})