def install_pipfiles(cwd, names, env, out, err):
    """Convert each Pipfile/Pipfile.lock in *names* to a requirements.txt
    (via ``pipenv lock -r`` inside the conda env *env*) and pip-install it.

    Returns (True, b"") on success, or (False, combined output bytes) on
    the first conversion or install failure.
    """
    for name in names:
        if not name:
            continue
        # Directory holding the Pipfile; pipenv runs from there.
        path = (cwd / name).parents[0]
        vprint(3, "Converting to requirements.txt: {}".format(path))
        # NOTE(review): the requirements.txt is written next to *cwd*'s
        # parent, not next to the Pipfile -- confirm this is intentional.
        requirements_txt = cwd.parents[0] / "requirements.txt"
        with open(str(requirements_txt), "wb") as outf:
            # pipenv's stdout (the lock output) is captured into the file.
            status, outdata, errdata = run_async_process(
                ". {}/etc/profile.d/conda.sh "
                "&& conda activate {} "
                "&& pipenv lock -r"
                .format(config.ANACONDA_PATH, env),
                outf, err, cwd=str(path)
            )
        data = b"##<>##\nOutput:\n" + outdata + b"\n##<>##Error:\n" + errdata
        if status != 0:
            return (False, data)
        # Install the freshly generated requirements.txt.
        result, data = install_requirements(
            requirements_txt.parents[0], ["requirements.txt"], env, out, err
        )
        if not result:
            return (False, data)
    return (True, b"")
def apply(session, status, skip_if_error, count, interval, reverse, check):
    """Extract markdown features"""
    conditions = [
        Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
        Cell.processed.op('&')(skip_if_error) == 0,
        Cell.cell_type == 'markdown',
    ]
    if interval:
        conditions.append(Cell.repository_id >= interval[0])
        conditions.append(Cell.repository_id <= interval[1])
    query = session.query(Cell).filter(*conditions)
    if count:
        print(query.count())
        return
    # Repositories are walked forward or backward; notebooks and cell
    # indexes always ascend so cells of a notebook stay contiguous.
    repo_order = (
        Cell.repository_id.desc() if reverse else Cell.repository_id.asc()
    )
    query = query.order_by(repo_order, Cell.notebook_id.asc(), Cell.index.asc())
    repository_id = None
    notebook_id = None
    for cell in query:
        if check_exit(check):
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        if repository_id != cell.repository_id:
            # Commit once per repository boundary.
            session.commit()
            repository_id = cell.repository_id
            vprint(0, 'Processing repository: {}'.format(repository_id))
        if notebook_id != cell.notebook_id:
            notebook_id = cell.notebook_id
            vprint(1, 'Processing notebook: {}'.format(notebook_id))
        vprint(2, 'Processing cell: {}/[{}]'.format(cell.id, cell.index))
        outcome = process_markdown_cell(
            session, repository_id, notebook_id, cell, skip_if_error
        )
        vprint(2, outcome)
        status.count += 1
    session.commit()
def load_repository(session, notebook, repository_id):
    """Commit pending work when *notebook* belongs to a new repository.

    Returns the repository id of *notebook* (the new "current" one) when a
    boundary is crossed, otherwise the unchanged *repository_id*.
    """
    if repository_id != notebook.repository_id:
        try:
            session.commit()
        except Exception as err:  # keep going; losing one commit is logged
            vprint(0, 'Failed to save modules from repository {} due to {}'.format(
                repository_id, err
            ))
        # Fix: report the repository being *started*; the original printed
        # the stale previous id (None on the first boundary), unlike the
        # sibling load_repository helpers which report the new repository.
        vprint(0, 'Processing repository: {}'.format(notebook.repository_id))
        return notebook.repository_id
    return repository_id
def load_repository(session, cell, skip_repo, repository_id, repository, archives):
    """Track repository boundaries while iterating cells.

    When *cell* belongs to a new repository: commit the previous one,
    switch to the new repository object, and reset the archive cache to
    the "todo" sentinel (reloaded lazily by load_notebook).
    Returns (skip_repo, repository_id, repository, archives).
    """
    if repository_id != cell.repository_id:
        repository = cell.repository_obj
        # NOTE(review): this session.commit() returns (success, msg) --
        # presumably a project wrapper around SQLAlchemy; confirm.
        success, msg = session.commit()
        if not success:
            vprint(
                0,
                'Failed to save cells from repository {} due to {}'.format(
                    repository, msg))
        vprint(0, 'Processing repository: {}'.format(repository))
        # "todo" marks archives as not-yet-loaded for the new repository.
        return False, cell.repository_id, repository, "todo"
    return skip_repo, repository_id, repository, archives
def pos_apply(dispatches, retry_errors, retry_timeout, verbose):
    """Dispatch execution to other python versions"""
    by_exec = lambda item: item[1]
    ordered = sorted(dispatches, key=by_exec)
    for pyexec, group in groupby(ordered, key=by_exec):
        vprint(0, "Dispatching to {}".format(pyexec))
        flags = []
        if retry_errors:
            flags.append("-e")
        if retry_timeout:
            flags.append("-t")
        flags.append("-n")
        pending = [notebook_id for notebook_id, _ in group]
        # Invoke in chunks of 20000 ids to keep argv within OS limits.
        while pending:
            chunk, pending = pending[:20000], pending[20000:]
            invoke(pyexec, "-u", __file__, "-v", verbose, *(flags + chunk))
def prepare_environment(
    session, env, mode, version, notebook_iter, mode_def,
    skip_env, notebook_exec_mode, dry_run, out, err
):
    """Prepare the conda/raw-python environment for Python *version*.

    Dry-run level >= 4 skips everything and reports success. If the
    environment cannot be installed, every pending notebook is marked with
    the failure bit for its mode and False is returned; True otherwise.
    """
    vprint(0, "{}Preparing {} environment for Python {}".format(
        "[DRY RUN] " if dry_run >= 4 else "",
        'anaconda' if mode.anaconda else 'raw python',
        version
    ))
    if dry_run >= 4:
        return True
    if not skip_env and not install_env(env, out, err):
        # Flag all notebooks so they are not retried in this broken env.
        for notebook, repository in notebook_iter:
            nmode = notebook_exec_mode(mode_def, notebook, repository)
            # nmode.processed * 2 is the "error" bit paired with the
            # mode's success bit (flags are powers of two).
            notebook.processed |= nmode.processed * 2
            session.add(notebook)
        session.commit()
        vprint(0, "Failed to prepare environment")
        return False
    return True
def install_requirements(cwd, names, env, out, err):
    """Run ``pip install -r`` inside the conda env for each named file.

    Returns (True, b"") on success or (False, combined output bytes)
    on the first failing install.
    """
    for entry in names:
        if not entry:
            continue
        req_path = cwd / entry
        vprint(3, "Installing {}".format(req_path))
        # Single-quote the path for the shell, escaping embedded quotes.
        quoted = str(req_path).replace("'", "'\\''")
        command = (
            ". {}/etc/profile.d/conda.sh "
            "&& conda activate {} "
            "&& pip install -r '{}'"
        ).format(config.ANACONDA_PATH, env, quoted)
        status, outdata, errdata = run_async_process(command, out, err)
        combined = (
            b"##<>##\nOutput:\n" + outdata + b"\n##<>##Error:\n" + errdata
        )
        if status != 0:
            return (False, combined)
    return (True, b"")
def load_notebook(session, cell, dispatches, repository, skip_repo,
                  skip_notebook, notebook_id, archives, checker):
    """Prepare a file checker for the notebook that owns *cell*.

    Returns (skip_repo, skip_notebook, notebook_id, archives, checker).
    Notebooks whose Python version does not match the running interpreter
    are queued in *dispatches* for re-processing under the right pyexec.
    """
    if notebook_id != cell.notebook_id:
        notebook_id = cell.notebook_id
        notebook = cell.notebook_obj
        if not notebook.compatible_version:
            pyexec = get_pyexec(notebook.py_version, config.VERSIONS)
            if sys.executable != pyexec:
                # Wrong interpreter: defer this notebook to another process.
                dispatches.add((notebook.id, pyexec))
                return skip_repo, True, cell.notebook_id, archives, None
        if archives == "todo":
            # Lazily load the repository archives on first use.
            skip_repo, archives = load_archives(session, repository)
            if skip_repo:
                return skip_repo, skip_notebook, cell.notebook_id, archives, None
        if archives is None:
            return True, True, cell.notebook_id, archives, None
        vprint(1, 'Processing notebook: {}'.format(notebook))
        name = to_unicode(notebook.name)
        tarzip, repo_path = archives
        notebook_path = os.path.join(repo_path, name)
        try:
            # Pick a checker matching how the repository is stored: a set
            # of DB-recorded paths, a tar archive, or a plain directory.
            if isinstance(tarzip, set):
                checker = SetLocalChecker(tarzip, notebook_path)
            elif tarzip:
                checker = CompressedLocalChecker(tarzip, notebook_path)
            else:
                checker = PathLocalChecker(notebook_path)
            if not checker.exists(notebook_path):
                raise Exception(
                    "Repository content problem. Notebook not found")
            return skip_repo, False, cell.notebook_id, archives, checker
        except Exception as err:
            vprint(
                2, "Failed to load notebook {} due to {}".format(notebook, err))
            return skip_repo, True, cell.notebook_id, archives, checker
    return skip_repo, skip_notebook, notebook_id, archives, checker
def collect_requirements(session, repository):
    """Record counts and paths of dependency files for *repository*.

    Looks for setup.py / requirements.txt / Pipfile / Pipfile.lock either
    in the checked-out path or inside the zip archive; marks the
    repository unavailable when neither exists.
    """
    if repository.path.exists():
        vprint(2, "using path")
        setups, requirements, pipfiles, pipfile_locks = find_files_in_path(
            repository.path,
            ["setup.py", "requirements.txt", "Pipfile", "Pipfile.lock"])
        changed = True
    elif repository.zip_path.exists():
        vprint(2, "using zip")
        with tarfile.open(str(repository.zip_path)) as tarzip:
            setups, requirements, pipfiles, pipfile_locks = find_files_in_zip(
                tarzip, Path(repository.hash_dir2),
                ["setup.py", "requirements.txt", "Pipfile", "Pipfile.lock"])
        changed = True
    else:
        vprint(2, "not found")
        repository.processed |= consts.R_UNAVAILABLE_FILES
        changed = False
    if changed:
        repository.setups_count = len(setups)
        repository.requirements_count = len(requirements)
        repository.pipfiles_count = len(pipfiles)
        repository.pipfile_locks_count = len(pipfile_locks)
        repository.setups = join_paths(setups)
        repository.requirements = join_paths(requirements)
        repository.pipfiles = join_paths(pipfiles)
        repository.pipfile_locks = join_paths(pipfile_locks)
    # Persist either the new counts or the unavailable flag.
    session.add(repository)
    session.commit()
def get_notebook(repository_id, notebook_id):
    """Render a stored notebook to RDF (Turtle) and return (rdf, name).

    Reads the notebook from the execution directory when present,
    otherwise from the repository checkout; writes a ``<name>.ttl`` file
    next to the repository and returns the RDF text plus notebook name.
    Returns ('', name) when the repository path is missing or unreadable.
    """
    with connect() as session:
        nbconvert_rdf = ''
        name = ''
        filters = [Repository.id == repository_id]
        repository = session.query(Repository).filter(*filters).first()
        notebook_filters = [
            Notebook.id == notebook_id,
            Notebook.repository_id == repository_id
        ]
        # NOTE(review): .first() may return None for an unknown id, which
        # would raise AttributeError below -- confirm callers guarantee it.
        notebook_query = session.query(Notebook).filter(
            *notebook_filters).first()
        name = notebook_query.name
        with mount_basedir():
            if repository.path.exists():
                # Prefer the execution copy when one exists.
                execution_path = (config.EXECUTION_DIR / repository.hash_dir2)
                if os.path.exists(execution_path):
                    notebook_path = execution_path
                else:
                    notebook_path = repository.path
                try:
                    with open(str(notebook_path / name)) as ofile:
                        notebook = ofile.read()
                    nbtordfconverter = nb2rdf.NBToRDFConverter()
                    notebook_json = nbformat.reads(notebook, as_version=4)
                    nbconvert_rdf = nbtordfconverter.convert_to_rdf(
                        name, notebook_json)
                    output_file_extension = 'ttl'
                    output_file = os.path.join(
                        repository.path, name + "." + output_file_extension)
                    # Fix: close the output file deterministically instead
                    # of leaking the handle from open(...).write(...).
                    with open(output_file, 'w') as rdf_file:
                        rdf_file.write(str(nbconvert_rdf))
                    return str(nbconvert_rdf), name
                except OSError as e:
                    vprint(3, "Failed to open notebook {}".format(e))
        return str(nbconvert_rdf), name
def load_archives(session, repository):
    """Locate the content of *repository*, extracting it if necessary.

    Returns (skip, archives) where archives is a (container, base_path)
    pair: container is a set of file paths (DB-recorded extraction), an
    open tarfile (fallback), or None (plain directory). skip=True means
    the repository cannot be used.
    """
    if not repository.processed & consts.R_EXTRACTED_FILES:
        if repository.zip_path.exists():
            vprint(1, 'Extracting files')
            result = process_repository(session, repository, skip_if_error=0)
            try:
                session.commit()
                if result != "done":
                    raise Exception("Extraction failure. Fallback")
                vprint(1, result)
            except Exception as err:
                # Extraction failed: fall back to reading the tar directly.
                vprint(1, 'Failed: {}'.format(err))
                try:
                    tarzip = tarfile.open(str(repository.zip_path))
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                        session.add(repository)
                except tarfile.ReadError:
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.add(repository)
                    return True, None
                zip_path = to_unicode(repository.hash_dir2)
                return False, (tarzip, zip_path)
        elif repository.path.exists():
            # Uncompressed checkout available on disk.
            repo_path = to_unicode(repository.path)
            return False, (None, repo_path)
        else:
            repository.processed |= consts.R_UNAVAILABLE_FILES
            session.add(repository)
            vprint(1, "Failed to load repository. Skipping")
            return True, None
    # Files were extracted into the DB: use the recorded path set.
    tarzip = {
        fil.path for fil in session.query(RepositoryFile).filter(
            RepositoryFile.repository_id == repository.id)
    }
    zip_path = ""
    if tarzip:
        return False, (tarzip, zip_path)
    return True, None
def apply(
    session, status, skip_if_error, count, interval, reverse, check
):
    """Extract code cell features"""
    conditions = [
        Notebook.processed.op("&")(consts.N_AGGREGATE_OK) == 0,
        Notebook.processed.op("&")(skip_if_error) == 0,
        Notebook.processed.op("&")(consts.N_GENERIC_LOAD_ERROR) == 0,
    ]
    if interval:
        conditions.append(Notebook.repository_id >= interval[0])
        conditions.append(Notebook.repository_id <= interval[1])
    query = session.query(Notebook).filter(*conditions)
    if count:
        print(query.count())
        return
    if reverse:
        ordering = (Notebook.repository_id.desc(), Notebook.id.desc())
    else:
        ordering = (Notebook.repository_id.asc(), Notebook.id.asc())
    query = query.order_by(*ordering)
    repository_id = None
    for notebook in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        # Commits once per repository boundary and logs the switch.
        repository_id = load_repository(session, notebook, repository_id)
        vprint(1, 'Processing notebook: {}'.format(notebook))
        outcome = process_notebook(session, notebook, skip_if_error)
        vprint(1, outcome)
        status.count += 1
    session.commit()
def apply(
    session, status, skip_if_error, count, interval, reverse, check
):
    """Extract code cell features"""
    conditions = [
        Repository.processed.op('&')(consts.R_EXTRACTED_FILES) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,  # Compressed
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])
    query = session.query(Repository).filter(*conditions)
    if count:
        print(query.count())
        return
    query = query.order_by(
        Repository.id.desc() if reverse else Repository.id.asc()
    )
    for repository in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            outcome = process_repository(session, repository, skip_if_error)
            vprint(1, outcome)
        status.count += 1
        session.commit()
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks/cells from repositories, in id batches of 30."""
    while selected_repositories:
        conditions = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0,  # no extraction
            Repository.processed.op("&")(skip_if_error) == 0,  # no failure
        ]
        if selected_repositories is not True:
            batch = selected_repositories[:30]
            selected_repositories = selected_repositories[30:]
            conditions.append(Repository.id.in_(batch))
        else:
            # True means "everything": run one pass, then stop the loop.
            selected_repositories = False
        if interval:
            conditions.append(Repository.id >= interval[0])
            conditions.append(Repository.id <= interval[1])
        query = session.query(Repository).filter(*conditions)
        if count:
            print(query.count())
            return
        query = query.order_by(
            Repository.id.desc() if reverse else Repository.id.asc()
        )
        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                outcome = process_repository(session, repository, skip_if_error)
                vprint(0, outcome)
            status.count += 1
            session.commit()
def apply(session, status, count, interval, reverse, check):
    """Extract code cell features"""
    conditions = [CellModule.local_possibility.is_(None)]
    if interval:
        conditions.append(CellModule.repository_id >= interval[0])
        conditions.append(CellModule.repository_id <= interval[1])
    query = session.query(CellModule).filter(*conditions)
    if count:
        print(query.count())
        return
    if reverse:
        query = query.order_by(
            CellModule.repository_id.desc(), CellModule.id.desc())
    else:
        query = query.order_by(
            CellModule.repository_id.asc(), CellModule.id.asc())
    # Repository-level caches carried across iterations.
    skip_repo, repository_id, archives = False, None, None
    for cell_module in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        skip_repo, repository_id, archives = load_repository(
            session, cell_module, skip_repo, repository_id, archives)
        if skip_repo:
            continue
        vprint(1, 'Processing module: {}'.format(cell_module))
        outcome = process_cell_module(session, cell_module, archives)
        vprint(1, outcome)
        status.count += 1
    session.commit()
def execute_notebooks(
    status, session, cwd, notebooks_iter, mode, notebook_exec_mode, dry_run
):
    """Run each notebook through run_notebook.py inside the conda env.

    Clears any previous error bit for the mode, then records either the
    success bit (nmode.processed) or the error bit (nmode.processed * 2)
    depending on the subprocess exit status. Returns "done".
    """
    notebooks_iter = list(notebooks_iter)
    vprint(2, "{}Running {} notebooks".format(
        "[DRY RUN] " if dry_run >= 1 else "",
        len(notebooks_iter)
    ))
    if dry_run >= 1:
        return "done"
    for notebook, repository in notebooks_iter:
        status.count += 1
        status.report()
        nmode = notebook_exec_mode(mode, notebook, repository)
        # Drop a previous failure mark before retrying this mode.
        if notebook.processed & (nmode.processed * 2):
            notebook.processed -= nmode.processed * 2
        mode_num = exec_to_num(*nmode)
        vprint(2, "Running notebook {}".format(notebook))
        # Notebook path is single-quoted for the shell; embedded quotes
        # are escaped. The env is hard-coded to "work" here.
        pstatus = subprocess.call(
            '. {}/etc/profile.d/conda.sh '
            '&& conda activate {} '
            "&& python reproducemegit/jupyter_reproducibility/run_notebook.py -n {} -p '{}' -m {}"
            .format(
                config.ANACONDA_PATH, "work", notebook.id,
                str(cwd / notebook.name).replace("'", "'\\''"), mode_num
            ),
            shell=True,
        )
        error = pstatus != 0
        # Success sets the mode bit; failure sets its doubled error bit.
        processed = nmode.processed * (2 if error else 1)
        vprint(2, "Status: {}. Mode: {}. Set Processed: {}".format(
            pstatus, mode_num, processed
        ))
        notebook.processed |= processed
        session.add(notebook)
        session.commit()
    return "done"
def apply(session, status, selected_repositories, processed, no,
          count, interval, reverse, check):
    """Unzip repositories matching the *processed* flags, in batches."""
    while selected_repositories:
        conditions = [
            Repository.processed.op("&")(processed) == processed,  # no extraction
            Repository.processed.op("&")(no) == 0,  # no failure
        ]
        if selected_repositories is not True:
            batch = selected_repositories[:30]
            selected_repositories = selected_repositories[30:]
            conditions.append(Repository.id.in_(batch))
        else:
            # True means "everything": one pass, then leave the loop.
            selected_repositories = False
        if interval:
            conditions.append(Repository.id >= interval[0])
            conditions.append(Repository.id <= interval[1])
        query = session.query(Repository).filter(*conditions)
        if count:
            print(query.count())
            return
        query = query.order_by(
            Repository.id.desc() if reverse else Repository.id.asc()
        )
        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                outcome = unzip_repository(session, repository)
                vprint(1, outcome)
            status.count += 1
            session.commit()
def execute_repository(
    status, session, repository, notebooks_iter, mode, env,
    skip_extract, notebook_exec_mode, dry_run, out, err,
):
    """Extract a repository, install its dependencies, run its notebooks.

    Returns "done" or an error-message string. Dry-run levels skip stages
    progressively (>= 3 skips extraction; deeper levels are handled by
    the callees).
    """
    vprint(1, "Executing notebooks from {}".format(repository))
    # Clear stale availability/compression error flags before retrying.
    if repository.processed & consts.R_UNAVAILABLE_FILES:
        repository.processed -= consts.R_UNAVAILABLE_FILES
        session.add(repository)
    if repository.processed & consts.R_COMPRESS_ERROR:
        repository.processed -= consts.R_COMPRESS_ERROR
        session.add(repository)
    cwd = config.EXECUTION_DIR / repository.hash_dir2
    vprint(2, "{}Preparing repository directory".format(
        "[DRY RUN] " if dry_run >= 3 else "",
    ))
    if dry_run < 3:
        with mount_umount(out, err):
            # extract_repository may relocate cwd (e.g. to a temp dir).
            success, cwd, msg = extract_repository(
                session, repository, skip_extract, out, err
            )
            vprint(3, msg)
            if not success:
                return "Failed to extract repository"
    if mode.dependencies:
        msg = install_repository_dependencies(
            status, session, cwd, repository, notebooks_iter, mode, env,
            notebook_exec_mode, dry_run, out, err
        )
        if msg is not None:
            return msg
    return execute_notebooks(
        status, session, cwd, notebooks_iter, mode, notebook_exec_mode,
        dry_run
    )
def load_repository(session, cell_module, skip_repo, repository_id, archives):
    """Switch the "current repository" while iterating cell modules.

    Commits the previous repository, then loads the set of extracted file
    paths for the new one. Returns (skip_repo, repository_id, archives);
    skip_repo is True when the repository files were never extracted.
    """
    if repository_id != cell_module.repository_id:
        repository = cell_module.repository_obj
        try:
            session.commit()
        except Exception as err:
            vprint(
                0, 'Failed to save modules from repository {} due to {}'.format(
                    repository_id, err))
        vprint(0, 'Processing repository: {}'.format(repository))
        if not repository.processed & consts.R_EXTRACTED_FILES:
            vprint(1, 'Skipping. Files not extracted from repository')
            return True, cell_module.repository_id, None
        # All file paths recorded for this repository, used as a lookup set.
        archives = {
            fil.path for fil in session.query(RepositoryFile).filter(
                RepositoryFile.repository_id == repository.id)
        }
        return False, cell_module.repository_id, archives
    return skip_repo, repository_id, archives
def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories"""
    filters = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = session.query(Repository).filter(*filters)
    if count:
        print(query.count())
        return
    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())
    for repository in query:
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                if repository.path.exists():
                    # Flag checkouts that drifted from the recorded commit.
                    commit = repository.get_commit()
                    if commit != repository.commit:
                        repository.processed |= consts.R_COMMIT_MISMATCH
                    # Pessimistically mark an error; cleared on success.
                    repository.processed |= consts.R_COMPRESS_ERROR
                    if repository.zip_path.exists() or repository.compress():
                        if repository.processed & consts.R_COMPRESS_ERROR:
                            repository.processed -= consts.R_COMPRESS_ERROR
                        if not keep:
                            # Drop the uncompressed checkout once archived.
                            shutil.rmtree(str(repository.path),
                                          ignore_errors=True)
                elif not repository.zip_path.exists():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                vprint(1, "Failed: {}".format(err))
        session.add(repository)
        status.count += 1
        session.commit()
def load_notebook(repository_id, path, notebook_file, nbrow):
    """Extract notebook information and cells from notebook.

    Fills *nbrow* (a dict of Notebook column values) in place and returns
    (nbrow, cells_info) where cells_info is a list of Cell row dicts.
    On load failure, returns (nbrow, []) with a load-error status.
    """
    # pylint: disable=too-many-locals
    status = 0
    try:
        with open(str(path / notebook_file)) as ofile:
            notebook = nbf.read(ofile, nbf.NO_CONVERT)
            nbrow["nbformat"] = "{0[nbformat]}".format(notebook)
            if "nbformat_minor" in notebook:
                nbrow["nbformat"] += ".{0[nbformat_minor]}".format(notebook)
            # Normalize every notebook to nbformat v4 before inspection.
            notebook = nbf.convert(notebook, 4)
            metadata = notebook["metadata"]
    except OSError as e:
        vprint(3, "Failed to open notebook {}".format(e))
        nbrow["processed"] = consts.N_LOAD_ERROR
        if os.path.islink(str(path / notebook_file)):
            import textwrap
            # Broken symlink: print the SQL that removes this notebook
            # name from the repository's recorded notebook list.
            vprint(3, "Notebook is broken link. Use the following SQL to fix:")
            text = (textwrap.dedent("""\
            select notebooks_count,
            (char_length(newtext) - char_length(replace(newtext, '''', ''))),
            concat(
                'update repositories ',
                'set notebooks_count = ',
                (char_length(newtext) - char_length(replace(newtext, ';', ''))) + 1,
                ', notebooks = ''', newtext,
                ''' where id = ', id, ';'
            ) from (
                select id, notebooks_count, replace(
                    replace(
                        replace(
                            notebooks,
                            '{0};', ''
                        ),
                        ';{0}', ''
                    ),
                    '''', ''''''
                ) as newtext
                from repositories where id = {1}
            ) as foo;
            """.format(notebook_file, repository_id)))
            text = " ".join(x.strip() for x in text.split("\n"))
            print(text)
        return nbrow, []
    except Exception as e:  # pylint: disable=broad-except
        vprint(3, "Failed to load notebook {}".format(e))
        nbrow["processed"] = consts.N_LOAD_FORMAT_ERROR
        return nbrow, []
    nbrow["kernel"] = metadata.get("kernelspec", {}).get("name", "no-kernel")
    language_info = metadata.get("language_info", {})
    nbrow["language"] = language_info.get("name", "unknown")
    nbrow["language_version"] = language_info.get("version", "unknown")
    # IPython shell used only for its source transformer (magics etc.).
    shell = InteractiveShell.instance()
    is_python = nbrow["language"] == "python"
    is_unknown_version = nbrow["language_version"] == "unknown"
    cells = notebook["cells"]
    cells_info = []
    exec_count = -1
    for index, cell in enumerate(cells):
        vprint(3, "Loading cell {}".format(index))
        cell_exec_count = cell.get("execution_count") or -1
        if isinstance(cell_exec_count, str) and cell_exec_count.isdigit():
            cell_exec_count = int(cell_exec_count)
        if isinstance(cell_exec_count, int):
            exec_count = max(exec_count, cell_exec_count)
        output_formats = ";".join(set(cell_output_formats(cell)))
        cell_processed = consts.C_OK
        if is_unknown_version:
            cell_processed = consts.C_UNKNOWN_VERSION
        try:
            source = cell["source"] = cell["source"] or ""
            if is_python and cell.get("cell_type") == "code":
                try:
                    # Expand IPython magics into plain Python.
                    source = shell.input_transformer_manager.transform_cell(
                        source)
                except (IndentationError, SyntaxError) as err:
                    vprint(3, "Error on cell transformation: {}".format(err))
                    source = ""
                    status = consts.N_LOAD_SYNTAX_ERROR
                    cell_processed |= consts.C_SYNTAX_ERROR
            if "\0" in source:
                # Null bytes break DB text columns.
                vprint(3, "Found null byte in source. Replacing it by \\n")
                source = source.replace("\0", "\n")
            cellrow = {
                "repository_id": repository_id,
                "notebook_id": None,
                "index": index,
                "cell_type": cell.get("cell_type", "<unknown>"),
                "execution_count": cell.get("execution_count"),
                "lines": cell["source"].count("\n") + 1,
                "output_formats": output_formats,
                "source": source,
                "python": is_python,
                "processed": cell_processed,
            }
            cells_info.append(cellrow)
            nbrow["total_cells"] += 1
            if cell.get("cell_type") == "code":
                nbrow["code_cells"] += 1
                if output_formats:
                    nbrow["code_cells_with_output"] += 1
            elif cell.get("cell_type") == "markdown":
                nbrow["markdown_cells"] += 1
            elif cell.get("cell_type") == "raw":
                nbrow["raw_cells"] += 1
            else:
                nbrow["unknown_cell_formats"] += 1
            if not cell["source"].strip():
                nbrow["empty_cells"] += 1
        except KeyError as err:
            vprint(3, "Error on cell extraction: {}".format(err))
            status = consts.N_LOAD_FORMAT_ERROR
    if nbrow["total_cells"] == 0:
        status = consts.N_LOAD_FORMAT_ERROR
    nbrow["max_execution_count"] = exec_count
    nbrow["processed"] = status
    return nbrow, cells_info
def load_repository(session, domain, repo, check_repo_only=True, branch=None,
                    commit=None, clone_existing=False):
    """Clone repository and extract its information.

    Returns the existing Repository row when one matches (by domain/repo,
    or by exact commit after cloning); otherwise clones, scans for
    notebooks and dependency files, and inserts a new Repository row.
    """
    vprint(0, "Processing repository: {}".format(repo))
    if check_repo_only:
        repository = session.query(Repository).filter(
            Repository.domain == domain,
            Repository.repository == repo,
        ).first()
        if repository is not None:
            vprint(1, "Repository exists: ID={}".format(repository.id))
            if not clone_existing:
                return repository
    part, end = extract_hash_parts(repo)
    remote = get_remote(domain, repo)
    vprint(1, "Remote: {}".format(remote))
    full_dir = clone(part, end, repo, remote, branch, commit)
    # Record the exact commit we ended up on after the clone/checkout.
    commit = git_output(
        "rev-parse", "HEAD", cwd=str(full_dir)
    ).decode("utf-8").strip()
    repository = session.query(Repository).filter(
        Repository.domain == domain,
        Repository.repository == repo,
        Repository.commit == commit,
    ).first()
    if repository is not None:
        if not check_repo_only:
            vprint(1, "Repository exists: ID={}".format(repository.id))
        # vprint(1, "> Removing .git directory")
        # shutil.rmtree(str(repository.path / ".git"), ignore_errors=True)
        return repository
    vprint(1, "Finding files")
    # Notebook paths relative to the clone root, skipping checkpoints.
    notebooks = [
        file.relative_to(full_dir)
        for file in find_files(full_dir, "*.ipynb")
        if ".ipynb_checkpoints" not in str(file)
    ]
    setups, requirements, pipfiles, pipfile_locks = find_files_in_path(
        full_dir, [
            "setup.py", "requirements.txt", "Pipfile", "Pipfile.lock"
        ]
    )
    repository = Repository(
        domain=domain, repository=repo,
        hash_dir1=part, hash_dir2=end,
        commit=commit,
        notebooks_count=len(notebooks),
        setups_count=len(setups),
        requirements_count=len(requirements),
        pipfiles_count=len(pipfiles),
        pipfile_locks_count=len(pipfile_locks),
        notebooks=join_paths(notebooks),
        setups=join_paths(setups),
        requirements=join_paths(requirements),
        pipfiles=join_paths(pipfiles),
        pipfile_locks=join_paths(pipfile_locks),
        processed=consts.R_OK,
    )
    session.add(repository)
    session.commit()
    # vprint("Removing .git directory")
    # shutil.rmtree(str(repository.path / ".git"), ignore_errors=True)
    vprint(1, "Done. ID={}".format(repository.id))
    return repository
def process_code_cell(
    session, repository_id, notebook_id, cell, checker,
    skip_if_error=consts.C_PROCESS_ERROR,
    skip_if_syntaxerror=consts.C_SYNTAX_ERROR,
    skip_if_timeout=consts.C_TIMEOUT,
):
    """Process Markdown Cell to collect features"""
    # NOTE(review): docstring says "Markdown" but this processes *code*
    # cells (extract_features / CodeAnalysis) -- likely copy-paste.
    if cell.processed & consts.C_PROCESS_OK:
        return 'already processed'
    # A zero skip_* flag means "retry cells that previously failed
    # that way"; their old derived rows must then be deleted first.
    retry = False
    retry |= not skip_if_error and cell.processed & consts.C_PROCESS_ERROR
    retry |= not skip_if_syntaxerror and cell.processed & consts.C_SYNTAX_ERROR
    retry |= not skip_if_timeout and cell.processed & consts.C_TIMEOUT
    if retry:
        deleted = (
            session.query(CellFeature).filter(
                CellFeature.cell_id == cell.id).delete() +
            session.query(CellModule).filter(
                CellModule.cell_id == cell.id).delete() +
            session.query(CellName).filter(
                CellName.cell_id == cell.id).delete() +
            session.query(CodeAnalysis).filter(
                CodeAnalysis.cell_id == cell.id).delete())
        if deleted:
            vprint(2, "Deleted {} rows".format(deleted))
        # Clear the old failure bits before re-analysis.
        if cell.processed & consts.C_PROCESS_ERROR:
            cell.processed -= consts.C_PROCESS_ERROR
        if cell.processed & consts.C_SYNTAX_ERROR:
            cell.processed -= consts.C_SYNTAX_ERROR
        if cell.processed & consts.C_TIMEOUT:
            cell.processed -= consts.C_TIMEOUT
        session.add(cell)
    try:
        error = False
        try:
            vprint(2, "Extracting features")
            analysis, modules, features, names = extract_features(
                cell.source, checker)
            processed = consts.A_OK
        except TimeoutError:
            processed = consts.A_TIMEOUT
            cell.processed |= consts.C_TIMEOUT
            error = True
        except SyntaxError:
            processed = consts.A_SYNTAX_ERROR
            cell.processed |= consts.C_SYNTAX_ERROR
            error = True
        if error:
            vprint(3, "Failed: {}".format(processed))
            # Store a zeroed analysis row so the failure is recorded.
            analysis = {
                x.name: 0 for x in CodeAnalysis.__table__.columns
                if x.name not in {"id", "repository_id", "notebook_id",
                                  "cell_id", "index"}
            }
            analysis["ast_others"] = ""
            modules = []
            features = []
            names = {}
        else:
            vprint(3, "Ok")
        analysis["processed"] = processed
        code_analysis = CodeAnalysis(
            repository_id=repository_id, notebook_id=notebook_id,
            cell_id=cell.id, index=cell.index, **analysis)
        dependents = []
        for line, import_type, module_name, local in modules:
            dependents.append(
                CellModule(
                    repository_id=repository_id,
                    notebook_id=notebook_id,
                    cell_id=cell.id,
                    index=cell.index,
                    line=line,
                    import_type=import_type,
                    module_name=module_name,
                    local=local,
                ))
        for line, column, feature_name, feature_value in features:
            dependents.append(
                CellFeature(
                    repository_id=repository_id,
                    notebook_id=notebook_id,
                    cell_id=cell.id,
                    index=cell.index,
                    line=line,
                    column=column,
                    feature_name="IPython/" + feature_name,
                    feature_value=feature_value,
                ))
        for (scope, context), values in names.items():
            for name, count in values.items():
                dependents.append(
                    CellName(
                        repository_id=repository_id,
                        notebook_id=notebook_id,
                        cell_id=cell.id,
                        index=cell.index,
                        scope=scope,
                        context=context,
                        name=name,
                        count=count,
                    ))
        vprint(2, "Adding session objects")
        # dependent_add links each child row to the analysis row's id.
        session.dependent_add(code_analysis, dependents, "analysis_id")
        cell.processed |= consts.C_PROCESS_OK
        return "done"
    except Exception as err:
        cell.processed |= consts.C_PROCESS_ERROR
        if config.VERBOSE > 4:
            import traceback
            traceback.print_exc()
        return 'Failed to process ({})'.format(err)
    finally:
        # The cell's flags are persisted on every exit path.
        session.add(cell)
def apply(session, status, dispatches, selected_notebooks,
          skip_if_error, skip_if_syntaxerror, skip_if_timeout,
          count, interval, reverse, check):
    """Extract code cell features"""
    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(
                consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            # Process notebook ids in batches of 30 per query.
            filters += [Cell.notebook_id.in_(selected_notebooks[:30])]
            selected_notebooks = selected_notebooks[30:]
        else:
            selected_notebooks = False
        if interval:
            filters += [
                Cell.repository_id >= interval[0],
                Cell.repository_id <= interval[1],
            ]
        query = (session.query(Cell).filter(*filters))
        if count:
            print(query.count())
            return
        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        # Caches carried across cells: the current repository, its
        # archives, the current notebook, and its file checker.
        skip_repo = False
        repository_id = None
        repository = None
        archives = None
        skip_notebook = False
        notebook_id = None
        checker = None
        for cell in query:
            if check_exit(check):
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()
            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository,
                    archives)
                if skip_repo:
                    continue
                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository, skip_repo,
                    skip_notebook, notebook_id, archives, checker)
                if skip_repo or skip_notebook:
                    continue
                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session, repository_id, notebook_id, cell, checker,
                    skip_if_error, skip_if_syntaxerror, skip_if_timeout,
                )
                vprint(2, result)
            status.count += 1
        session.commit()
def process_repository(session, repository, skip_if_error=consts.R_N_ERROR):
    """Process repository.

    Loads every notebook listed in repository.notebook_names, inserting
    Notebook and Cell rows. Sets R_N_EXTRACTION when all notebooks were
    handled; sets R_N_ERROR on load/commit failures.
    """
    if repository.processed & (consts.R_N_EXTRACTION + skip_if_error):
        return "already processed"
    if repository.processed & consts.R_N_ERROR:
        # Clear a previous error mark before retrying.
        session.add(repository)
        repository.processed -= consts.R_N_ERROR
    count = 0
    for name in repository.notebook_names:
        if not name:
            continue
        count += 1
        notebook = session.query(Notebook).filter(
            Notebook.repository_id == repository.id,
            Notebook.name == name,
        ).first()
        if notebook is not None:
            if notebook.processed & consts.N_STOPPED:
                # A previous run was interrupted mid-notebook: redo it.
                session.delete(notebook)
                session.commit()
            else:
                if notebook.processed & consts.N_GENERIC_LOAD_ERROR:
                    count -= 1
                vprint(2, "Notebook already exists. Delete from DB: {}".format(notebook))
                with open(str(config.LOGS_DIR / "todo_delete"), "a") as f:
                    f.write("{},".format(notebook.id))
                continue  # Skip working notebook
        if not repository.path.exists():
            vprint(2, "Unzipping repository: {}".format(repository.zip_path))
            msg = unzip_repository(session, repository)
            if msg != "done":
                vprint(2, msg)
                return "failed"
        try:
            vprint(2, "Loading notebook {}".format(name))
            nbrow = {
                "repository_id": repository.id,
                "name": name,
                "nbformat": 0,
                "kernel": "no-kernel",
                "language": "unknown",
                "language_version": "unknown",
                "max_execution_count": 0,
                "total_cells": 0,
                "code_cells": 0,
                "code_cells_with_output": 0,
                "markdown_cells": 0,
                "raw_cells": 0,
                "unknown_cell_formats": 0,
                "empty_cells": 0,
                "processed": consts.N_OK,
            }
            try:
                nbrow, cells = load_notebook(
                    repository.id, repository.path, name, nbrow)
            except TimeoutError:
                nbrow["processed"] = consts.N_LOAD_TIMEOUT
                cells = []
            # N_STOPPED marks the row as in-flight until the commit lands.
            nbrow["processed"] |= consts.N_STOPPED
            notebook = Notebook(**nbrow)
            session.dependent_add(
                notebook, [Cell(**cellrow) for cellrow in cells],
                "notebook_id"
            )
        except Exception as err:  # pylint: disable=broad-except
            repository.processed |= consts.R_N_ERROR
            session.add(repository)
            vprint(1, "Failed to load notebook {} due {!r}".format(name, err))
            if config.VERBOSE > 4:
                import traceback
                traceback.print_exc()
    if not repository.processed & consts.R_N_ERROR and count == repository.notebooks_count:
        repository.processed |= consts.R_N_EXTRACTION
        session.add(repository)
    # Project session wrapper: commit() returns (success, error).
    status, err = session.commit()
    if not status:
        # Roll the flags back so a later run retries this repository.
        if repository.processed & consts.R_N_EXTRACTION:
            repository.processed -= consts.R_N_EXTRACTION
        if not repository.processed & consts.R_N_ERROR:
            # NOTE(review): `+=` works here only because the guard above
            # ensures the bit is unset; `|=` would match the file's style.
            repository.processed += consts.R_N_ERROR
        session.add(repository)
        session.commit()
        return "Failed due {!r}".format(err)
    return "done"
def install_repository_dependencies(
    status, session, cwd, repository, notebooks_iter, mode, env,
    notebook_exec_mode, dry_run, out, err
):
    """Install the repository's declared dependencies into the environment.

    Tries each dependency-file kind (setup.py, requirements.txt, Pipfile,
    Pipfile.lock) via its installer. On any failure, records a skipped
    Execution row for every notebook in *notebooks_iter* and returns an
    error message string; returns None on success (or when dry_run >= 2).
    """
    vprint(2, "{}Installing repository dependencies".format(
        "[DRY RUN] " if dry_run >= 2 else ""
    ))
    if dry_run >= 2:
        return None
    install_options = [
        ("setup.py", install_setups, repository.setup_names),
        ("requirements.txt", install_requirements, repository.requirement_names),
        ("Pipfile", install_pipfiles, repository.pipfile_names),
        ("Pipfile.lock", install_pipfiles, repository.pipfile_lock_names),
    ]
    installed = True
    data_ok_list = []      # file kinds that installed cleanly
    data_failed_list = []  # file kinds that failed
    data_failed = b""      # concatenated failure output per kind
    for spec, func, names in install_options:
        # NOTE(review): the conda env name is hard-coded to "work" here even
        # though this function receives an `env` parameter -- confirm whether
        # `env` should be passed instead.
        success, data = func(cwd, names, "work", out, err)
        installed = installed and success
        spec_bytes = spec.encode("utf-8")
        if success:
            data_ok_list.append(spec_bytes)
        else:
            data_failed += b"\n##<<>>##" + spec_bytes + b":\n" + data
            data_failed_list.append(spec_bytes)
    if not installed:
        reason = "<Install Dependency Error>"
        cause = b"Ok: " + b", ".join(data_ok_list)
        cause += b"\n##<<>>##Failed: " + b", ".join(data_failed_list)
        cause += data_failed
        # Mark every notebook of this repository as skipped with the failure
        # details, creating or resetting its Execution row.
        # NOTE(review): the loop variable shadows the `repository` parameter.
        for notebook, repository in notebooks_iter:
            status.skipped += 1
            status.report()
            nmode = notebook_exec_mode(mode, notebook, repository)
            mode_num = exec_to_num(*nmode)
            execution = session.query(Execution).filter(
                Execution.notebook_id == notebook.id,
                Execution.mode == mode_num,
            ).first()
            if execution:
                # Never clobber an execution that already ran to completion.
                if execution.processed & consts.E_EXECUTED:
                    continue
                # Reset the stale row back to "created" with the new cause.
                execution.reason = reason
                execution.msg = cause
                execution.cell = None
                execution.count = None
                execution.diff = None
                execution.duration = None
                execution.timeout = None
                execution.diff_count = None
                execution.processed = consts.E_CREATED
            else:
                execution = Execution(
                    notebook_id=notebook.id,
                    mode=mode_num,
                    reason=reason,
                    msg=cause,
                    processed=consts.E_CREATED,
                    repository_id=notebook.repository_id,
                )
                session.add(execution)
            # presumably nmode is a namedtuple-like with a .processed flag
            # field -- verify against notebook_exec_mode's return type.
            notebook.processed |= nmode.processed
            session.add(notebook)
        session.commit()
        return "Failed to install {}".format(
            b", ".join(data_failed_list).decode("utf-8")
        )
    return None
def process_requirement_file(session, repository, reqformat,
                             skip_if_error=consts.R_REQUIREMENTS_ERROR):
    """Load all files of one requirement format into the database.

    Reads each declared file of *reqformat* ("setup.py", "requirements.txt",
    "Pipfile" or "Pipfile.lock") either from the unpacked repository tree or
    straight out of its zip/tar archive, decodes it, and stores it as a
    RequirementFile row.

    Returns True when every file was handled (skipped files count as
    handled), False when the repository is unavailable or a file raised.

    Fix over the original: the opened tarfile is now closed in a ``finally``
    block, so it no longer leaks when an exception escapes the per-file
    handler (e.g. a KeyError from an unknown *reqformat*).
    """
    MAP = {
        "setup.py": "setup",
        "requirements.txt": "requirement",
        "Pipfile": "pipfile",
        "Pipfile.lock": "pipfile_lock",
    }
    zip_path = None
    tarzip = None
    if not repository.path.exists():
        # No extracted tree: fall back to reading from the archive, or give
        # up and flag the repository when even the archive is gone.
        if not repository.zip_path.exists():
            repository.processed |= consts.R_UNAVAILABLE_FILES
            session.add(repository)
            vprint(
                1,
                "Failed to load requirement {} due <repository not found>".
                format(reqformat))
            return False
        tarzip = tarfile.open(str(repository.zip_path))
        zip_path = Path(repository.hash_dir2)
    finished = True
    try:
        req_param = MAP[reqformat] + "_names"
        for name in getattr(repository, req_param):
            if not name:
                continue
            try:
                vprint(2, "Loading requirement {}".format(name))
                if tarzip:
                    content = tarzip.extractfile(
                        tarzip.getmember(str(zip_path / name))).read()
                else:
                    with open(str(repository.path / name), "rb") as ofile:
                        content = ofile.read()
                # Guess the encoding; skip files we cannot decode safely.
                coding = chardet.detect(content)
                if coding["encoding"] is None:
                    vprint(3, "Codec not detected")
                    continue
                try:
                    content = content.decode(coding["encoding"])
                except UnicodeDecodeError:
                    vprint(3, "Invalid codec")
                    continue
                # NULL bytes would break text storage in the database.
                if "\0" in content:
                    vprint(3, "NULL byte in content")
                    continue
                requirement_file = RequirementFile(
                    repository_id=repository.id,
                    name=name,
                    reqformat=reqformat,
                    content=content,
                    processed=consts.F_OK,
                )
                session.add(requirement_file)
            except Exception as err:  # pylint: disable=broad-except
                # Best-effort: flag the repository and keep going with the
                # remaining files.
                repository.processed |= skip_if_error
                session.add(repository)
                vprint(1, "Failed to load requirement {} due {!r}".format(name, err))
                if config.VERBOSE > 4:
                    import traceback
                    traceback.print_exc()
                finished = False
    finally:
        # Always release the archive handle, even if the loop setup raised.
        if tarzip:
            tarzip.close()
    return finished
def extract_repository(session, repository, skip_extract, out, err):
    """Materialize the repository's files under config.EXECUTION_DIR.

    Returns a ``(success, cwd, message)`` tuple, where *cwd* is the directory
    that contains the repository's working tree on success.

    With *skip_extract* set, reuses a previously extracted directory instead
    of unpacking/copying again.
    """
    cwd = config.EXECUTION_DIR
    if skip_extract:
        cwd = (config.EXECUTION_DIR / repository.hash_dir2)
        if not cwd.exists():
            return (
                False, cwd,
                "Failed to use extracted dir. It does not exists"
            )
    else:
        try:
            # Start from a clean execution directory.
            if config.EXECUTION_DIR.exists():
                shutil.rmtree(str(config.EXECUTION_DIR), ignore_errors=True)
            if repository.zip_path.exists():
                # Preferred source: unpack the stored archive.
                config.EXECUTION_DIR.mkdir(parents=True, exist_ok=True)
                cmd = repository.uncompress(config.EXECUTION_DIR, return_cmd=True)
                vprint(3, "Extract: {}".format(repository.zip_path))
                vprint(3, "Command: {}".format(" ".join(cmd)))
                uncompressed = subprocess.call(cmd, stdout=out, stderr=err)
                if uncompressed != 0:
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.commit()
                    return (
                        False, cwd,
                        "Extraction failed with code {}".format(uncompressed),
                    )
            elif repository.path.exists():
                # Fall back to copying the already-extracted tree via tar
                # piping (preserves permissions/links better than cp).
                new_path = (config.EXECUTION_DIR / repository.hash_dir2)
                new_path.mkdir(parents=True, exist_ok=True)
                cmd = "tar cf - * | (cd {} ; tar xf - )".format(str(new_path))
                vprint(3, "Copy: {}".format(repository.path))
                vprint(3, "Command: {}".format(cmd))
                copied = subprocess.call(
                    cmd, shell=True, stdout=out, stderr=err,
                    cwd=str(repository.path)
                )
                if copied != 0:
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.commit()
                    return (
                        False, cwd,
                        "Copying failed with code {}".format(copied),
                    )
            else:
                # Neither archive nor tree available.
                repository.processed |= consts.R_UNAVAILABLE_FILES
                session.add(repository)
                session.commit()
                return (
                    False, cwd,
                    "Failed to find repository"
                )
            # The execution dir must contain exactly the repository's own
            # hash directory; anything else means leftover state.
            # NOTE(review): this compares glob("*") (one level) against
            # cwd / hash_dir2 -- assumes hash_dir2 is a single path
            # component; confirm against Repository.hash_dir2.
            files = [sub for sub in cwd.glob("*")]
            sub_cwd = cwd / repository.hash_dir2
            if files == [sub_cwd]:
                cwd = sub_cwd
            else:
                return (
                    False, cwd,
                    "Execution dir is full"
                )
        except Exception as e:
            repository.processed |= consts.R_COMPRESS_ERROR
            session.add(repository)
            session.commit()
            return (
                False, cwd,
                "Copy failed with exception {}".format(e),
            )
    # Sanity check: the materialized tree must match the recorded commit.
    commit = repository.get_commit(cwd)
    if commit != repository.commit:
        repository.processed |= consts.R_COMMIT_MISMATCH
        # NOTE(review): only session.add here, no commit -- unlike the error
        # paths above; confirm the caller commits the session.
        session.add(repository)
        return (
            False, cwd,
            "Commit mismatch. Expected {}. Found {}".format(
                repository.commit, commit
            ),
        )
    return (
        True, cwd,
        "Repository set to {}".format(cwd)
    )
def apply(
    session, repository_id, status, script_name, execution_mode,
    with_execution, with_dependency,
    skip_if_error, skip_if_error_mode, skip_if_troublesome,
    try_to_discover_files, skip_env, skip_extract, dry_run,
    mode_rules, notebook_exec_mode,
    count, interval, reverse, check
):
    """Execute repositories"""
    # execution_mode == -1 means "derive the mode per notebook" via
    # mode_rules/notebook_exec_mode instead of a fixed EXECUTION_MODE entry.
    mode_def = None if execution_mode == -1 else EXECUTION_MODE[execution_mode]
    filters = [
        Notebook.language == "python",
        Notebook.language_version != "unknown",
        # language_version longer than 3 chars => a full "X.Y.Z"-style
        # version, not just a major.minor prefix.
        func.length(Notebook.language_version) > 3,
        Repository.processed.op('&')(try_to_discover_files) == 0,
        Repository.processed.op('&')(consts.R_FAILED_TO_CLONE) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(skip_if_troublesome) == 0,
        Repository.id == repository_id,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[-1]
        ]
    filters += EXECUTION_RULES[with_execution]
    filters += DEPENDENCY_RULES[with_dependency]
    if mode_def is None:
        filters += mode_rules(
            with_execution, with_dependency, skip_if_error_mode
        )
    else:
        filters.append(
            Notebook.processed.op('&')(
                mode_def.processed * skip_if_error_mode
            ) == 0
        )
    query = (
        session.query(Notebook, Repository)
        .join(Repository)
        .filter(*filters)
    )
    if count:
        print(query.count())
        return
    # Order so that repositories without any dependency files come first,
    # then by language version, then by repository id.
    if reverse:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count +
             Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.desc()
        )
    else:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count +
             Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.asc()
        )
    # Per-run log files for subprocess output.
    moment = datetime.now().strftime("%Y%m%dT%H%M%S")
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)
    outf = str(config.LOGS_DIR / ("sub-{}-{}.out".format(script_name, moment)))
    errf = str(config.LOGS_DIR / ("sub-{}-{}.err".format(script_name, moment)))
    with open(outf, "wb") as out, open(errf, "wb") as err:
        # Outer grouping: (python version prefix, execution mode).
        # NOTE: groupby assumes the query ordering above keeps equal keys
        # adjacent.
        group = groupby(
            query,
            lambda x: (
                x[0].language_version[:3],
                notebook_exec_mode(mode_def, *x)
            )
        )
        last = None  # last prepared (env[, repository]) to avoid re-prepare
        for (version, mode), query_iter in group:
            status.report()
            vnum = version_string_to_list(version)
            envs = config.VERSIONS if mode.anaconda else config.RAW_VERSIONS
            env = best_match(vnum, envs)
            # Inner grouping: by repository.
            # NOTE(review): this rebinds `group` while the outer loop is
            # iterating it -- harmless (the for loop keeps its own iterator)
            # but confusing; also `notebook_iter` is handed to both
            # prepare_environment and execute_repository, which could
            # exhaust the groupby iterator -- confirm their consumption.
            group = groupby(
                query_iter,
                lambda x: (x[1])
            )
            for repository, notebook_iter in group:
                if check_exit(check):
                    vprint(0, "Found .exit file. Exiting")
                    return
                # When the mode installs dependencies the environment is
                # repository-specific; otherwise it only depends on `env`.
                current = (env, repository) if mode.dependencies else env
                if last != current:
                    prepared = prepare_environment(
                        session, env, mode, version, notebook_iter, mode_def,
                        skip_env, notebook_exec_mode, dry_run, out, err
                    )
                    if not prepared:
                        continue
                    # Dependency-installing modes taint the env, so force a
                    # re-prepare for the next repository.
                    last = None if mode.dependencies else current
                result = execute_repository(
                    status, session, repository, notebook_iter, mode, env,
                    skip_extract, notebook_exec_mode, dry_run, out, err
                )
                vprint(2, result)
                session.commit()