def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories"""
    filters = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = session.query(Repository).filter(*filters)
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())

    for repository in query:
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                if repository.path.exists():
                    commit = repository.get_commit()
                    if commit != repository.commit:
                        repository.processed |= consts.R_COMMIT_MISMATCH
                # Assume failure up front; the flag is cleared below once the
                # zip file is confirmed or the failure is classified
                repository.processed |= consts.R_COMPRESS_ERROR
                if repository.zip_path.exists() or repository.compress():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not keep:
                        shutil.rmtree(str(repository.path), ignore_errors=True)
                elif not repository.zip_path.exists():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                vprint(1, "Failed: {}".format(err))
        session.add(repository)
        status.count += 1
        session.commit()
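# A minimal sketch of the bit-flag bookkeeping used above, with hypothetical
# flag values (the real ones live in consts). Each status is a distinct power
# of two, so |= sets a flag, -= clears a flag known to be set, and & tests it.
R_COMPRESS_OK = 2 ** 0     # hypothetical value
R_COMPRESS_ERROR = 2 ** 1  # hypothetical value

processed = 0
processed |= R_COMPRESS_ERROR      # assume failure up front
if processed & R_COMPRESS_ERROR:   # test before clearing: -= on an unset
    processed -= R_COMPRESS_ERROR  # flag would corrupt the other bits
processed |= R_COMPRESS_OK         # record success
assert processed == R_COMPRESS_OK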
def apply(
    session, status, skip_if_error,
    count, interval, reverse, check
):
    """Extract files from compressed repositories"""
    filters = [
        Repository.processed.op('&')(consts.R_EXTRACTED_FILES) == 0,  # not extracted yet
        Repository.processed.op('&')(skip_if_error) == 0,  # no previous failure
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,  # compressed
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = (
        session.query(Repository)
        .filter(*filters)
    )
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())

    for repository in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            result = process_repository(session, repository, skip_if_error)
            vprint(1, result)
        status.count += 1
        session.commit()
def github_crawler(github_url):
    count = None
    interval = None
    reverse = None
    check = ('all',)
    keep_uncompressed = False
    dispatches = set()
    script_name = None
    skip_env = False
    skip_extract = False
    dry_run = 0

    status = StatusLogger(script_name)
    status.report()

    with connect() as session, mount_basedir(), savepid():
        repository = load_repository.load_repository_from_url(
            session, github_url)
        s1_notebooks_and_cells.apply(
            SafeSession(session, interrupted=consts.N_STOPPED),
            status, [repository.id], consts.R_N_ERROR,
            count, interval, reverse, set(check))
        s2_requirement_files.apply(
            session, status, [repository.id], consts.R_REQUIREMENTS_ERROR,
            count, interval, reverse, set(check))
        s3_compress.apply(
            session, status, keep_uncompressed,
            count, interval, reverse, set(check))
        s4_markdown_features.apply(
            session, status, consts.C_PROCESS_ERROR,
            count, interval, reverse, set(check))
        s5_extract_files.apply(
            session, status, consts.R_COMPRESS_ERROR,
            count, interval, reverse, set(check))
        s6_cell_features.apply(
            SafeSession(session), status, dispatches, True,
            consts.C_PROCESS_ERROR, consts.C_SYNTAX_ERROR, consts.C_TIMEOUT,
            count, interval, reverse, set(check))
        s7_execute_repositories.apply(
            session, repository.id, status, script_name,
            config.EXECUTION_MODE, config.WITH_EXECUTION, config.WITH_DEPENDENCY,
            consts.R_COMPRESS_ERROR, 3,
            consts.R_TROUBLESOME, consts.R_UNAVAILABLE_FILES,
            skip_env, skip_extract, dry_run,
            mode_rules, s7_execute_repositories.notebook_exec_mode,
            count, interval, reverse, set(check))
        p0_local_possibility.apply(
            session, status, count, interval, reverse, set(check))
        p1_notebook_aggregate.apply(
            session, status, consts.N_AGGREGATE_ERROR,
            count, interval, reverse, set(check))
        p2_sha1_exercises.apply(
            session, status, count, interval, reverse, set(check))
        return repository.id
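# Hypothetical usage sketch: crawl a single GitHub repository end to end.
# The URL is illustrative; github_crawler returns the stored repository id.
if __name__ == '__main__':
    repository_id = github_crawler('https://github.com/user/example-repo')
    print('Processed repository', repository_id)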
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks and cells from repositories"""
    while selected_repositories:
        filters = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0,  # no extraction
            Repository.processed.op("&")(skip_if_error) == 0,  # no failure
        ]
        if selected_repositories is not True:
            filters += [
                Repository.id.in_(selected_repositories[:30])
            ]
            selected_repositories = selected_repositories[30:]
        else:
            selected_repositories = False
        if interval:
            filters += [
                Repository.id >= interval[0],
                Repository.id <= interval[1],
            ]
        query = session.query(Repository).filter(*filters)
        if count:
            print(query.count())
            return

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                result = process_repository(session, repository, skip_if_error)
                vprint(0, result)
            status.count += 1
            session.commit()
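# A standalone sketch of the 30-id batching used above: slicing the selection
# keeps each SQL IN clause small while the while-loop drains the full list.
selected = list(range(100))  # hypothetical repository ids
batches = []
while selected:
    batches.append(selected[:30])
    selected = selected[30:]
assert [len(b) for b in batches] == [30, 30, 30, 10]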
def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Load Repository by URL")
    parser.add_argument("url", type=str,
                        help="repository URL")
    parser.add_argument("-v", "--verbose", type=int, default=config.VERBOSE,
                        help="increase output verbosity")
    parser.add_argument("-b", "--branch", type=str,
                        help="specific branch")
    parser.add_argument("-c", "--commit", type=str,
                        help="specific commit")
    parser.add_argument("-e", "--clone-existing", action='store_true',
                        help="clone even if repository exists")

    args = parser.parse_args()
    config.VERBOSE = args.verbose

    with connect() as session, mount_basedir(), savepid():
        load_repository_from_url(
            session, args.url, args.branch, args.commit, args.clone_existing
        )
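# Example invocations, assuming this main() lives in a script named
# load_repository.py (the file name is an assumption):
#   python load_repository.py https://github.com/user/example-repo
#   python load_repository.py https://github.com/user/example-repo -b main -v 2
#   python load_repository.py https://github.com/user/example-repo -c abc1234 -e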
def apply(session, status, selected_repositories, processed, no,
          count, interval, reverse, check):
    """Unzip repositories"""
    while selected_repositories:
        filters = [
            Repository.processed.op("&")(processed) == processed,  # required flags all set
            Repository.processed.op("&")(no) == 0,  # excluded flags unset
        ]
        if selected_repositories is not True:
            filters += [Repository.id.in_(selected_repositories[:30])]
            selected_repositories = selected_repositories[30:]
        else:
            selected_repositories = False
        if interval:
            filters += [
                Repository.id >= interval[0],
                Repository.id <= interval[1],
            ]
        query = session.query(Repository).filter(*filters)
        if count:
            print(query.count())
            return

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                result = unzip_repository(session, repository)
                vprint(1, result)
            status.count += 1
            session.commit()
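# Sketch of the two mask tests in the filters above, with hypothetical flags:
# (processed & mask) == mask requires *all* flags in mask, while != 0 only
# requires at least one of them.
A, B = 1, 2
processed = A | B
assert (processed & (A | B)) == (A | B)  # all of A and B set
assert (processed & A) != 0              # at least A set
processed = A
assert (processed & (A | B)) != (A | B)  # B missing, the "all" test fails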
def get_notebook(repository_id, notebook_id):
    with connect() as session:
        nbconvert_rdf = ''
        name = ''
        filters = [Repository.id == repository_id]
        repository = session.query(Repository).filter(*filters).first()
        notebook_filters = [
            Notebook.id == notebook_id,
            Notebook.repository_id == repository_id,
        ]
        notebook_query = session.query(Notebook).filter(
            *notebook_filters).first()
        name = notebook_query.name
        with mount_basedir():
            if repository.path.exists():
                execution_path = config.EXECUTION_DIR / repository.hash_dir2
                # Prefer the executed copy of the notebook when one exists
                if os.path.exists(execution_path):
                    notebook_path = execution_path
                else:
                    notebook_path = repository.path
                try:
                    with open(str(notebook_path / name)) as ofile:
                        notebook = ofile.read()
                    nbtordfconverter = nb2rdf.NBToRDFConverter()
                    notebook_json = nbformat.reads(notebook, as_version=4)
                    nbconvert_rdf = nbtordfconverter.convert_to_rdf(
                        name, notebook_json)
                    output_file_extension = 'ttl'
                    output_file = os.path.join(
                        repository.path, name + "." + output_file_extension)
                    with open(output_file, 'w') as rdf_file:
                        rdf_file.write(str(nbconvert_rdf))
                    return str(nbconvert_rdf), name
                except OSError as e:
                    vprint(3, "Failed to open notebook {}".format(e))
        return str(nbconvert_rdf), name
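# A minimal sketch of the nbformat step above: parse a notebook JSON string
# into the version-4 node structure that the RDF converter consumes. The
# notebook content here is a hypothetical empty notebook.
import nbformat

raw = '{"cells": [], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}'
nb = nbformat.reads(raw, as_version=4)
assert nb.nbformat == 4 and nb.cells == []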
def apply(session, status, dispatches, selected_notebooks,
          skip_if_error, skip_if_syntaxerror, skip_if_timeout,
          count, interval, reverse, check):
    """Extract code cell features"""
    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            filters += [Cell.notebook_id.in_(selected_notebooks[:30])]
            selected_notebooks = selected_notebooks[30:]
        else:
            selected_notebooks = False
        if interval:
            filters += [
                Cell.repository_id >= interval[0],
                Cell.repository_id <= interval[1],
            ]
        query = session.query(Cell).filter(*filters)
        if count:
            print(query.count())
            return

        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )

        skip_repo = False
        repository_id = None
        repository = None
        archives = None

        skip_notebook = False
        notebook_id = None
        checker = None

        for cell in query:
            if check_exit(check):
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()
            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository, archives)
                if skip_repo:
                    continue
                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository,
                    skip_repo, skip_notebook, notebook_id, archives, checker)
                if skip_repo or skip_notebook:
                    continue
                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session, repository_id, notebook_id, cell, checker,
                    skip_if_error, skip_if_syntaxerror, skip_if_timeout,
                )
                vprint(2, result)
            status.count += 1
            session.commit()