def non_executed_prop(nb_id):
    """Return the proportion of code cells that were never executed.

    The numerator is ``count_non_exec(nb_id)``; the denominator is the number
    of code cells returned by ``data.get_code_cells(nb_id)``.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        The proportion as a float, or None when the notebook has no code
        cells (the ratio is undefined there; this used to raise
        ZeroDivisionError).
    """
    num_code_cells = len(data.get_code_cells(nb_id))

    # guard the division: no code cells means there is nothing to measure
    if num_code_cells == 0:
        return None

    return float(count_non_exec(nb_id)) / float(num_code_cells)
def has_export(nb_id):
    """Tell whether at least one code cell of the notebook passes ``exports``.

    Cells are checked in notebook order and the scan stops at the first hit.
    """
    return any(exports(cell) for cell in data.get_code_cells(nb_id))
def has_param_import(nb_id):
    """Tell whether any code cell imports a parameterization module.

    A cell matches when ``has_import`` reports an import of either
    ``papermill`` or ``parameterized``.
    """
    param_modules = ('papermill', 'parameterized')

    for cell in data.get_code_cells(nb_id):
        # modules are tried in the listed order; stop at the first match
        for module in param_modules:
            if has_import(cell, module):
                return True

    return False
def num_functions(nb_id):
    """Return the sum of ``def_in_cell`` over every code cell of the notebook."""
    return sum(def_in_cell(cell) for cell in data.get_code_cells(nb_id))
def has_testing(nb_id):
    """Tell whether any code cell imports a testing module.

    A cell matches when ``has_import`` reports an import of ``pytest``,
    ``test`` or ``unittest``.
    """
    testing_modules = ('pytest', 'test', 'unittest')

    for cell in data.get_code_cells(nb_id):
        # modules are tried in the listed order; stop at the first match
        for module in testing_modules:
            if has_import(cell, module):
                return True

    return False
def _output_shows_image_or_table(output):
    """Return True if a single cell output displays a PNG image or an HTML table."""
    # outputs without a type cannot be classified
    if 'output_type' not in output:
        return False

    if output['output_type'] == "display_data":
        # the image is either a top-level "png" field or an "image/png"
        # entry under "data"
        if "png" in output:
            return True
        return "data" in output and "image/png" in output['data']

    # any other output type counts only when its HTML payload holds a table
    if 'data' in output and 'text/html' in output['data']:
        html = output['data']['text/html']
        # the payload may be one string instead of a list of lines; iterating
        # a bare string would yield single characters and never match
        if not isinstance(html, (list, tuple)):
            html = [html]
        return any("</table>" in line for line in html)

    return False


def count_images(nb_id):
    """Count the code cells whose outputs display an image or a table.

    A cell is counted at most once, however many of its outputs qualify.
    Cells without an ``outputs`` field are ignored.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        Number of code cells with at least one image or table output.
    """
    code_cells = data.get_code_cells(nb_id)

    return sum(
        1
        for cell in code_cells
        if 'outputs' in cell
        and any(_output_shows_image_or_table(out) for out in cell['outputs'])
    )
def output_cells(nb_id):
    """Count the code cells that have at least one output.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        Number of code cells whose ``outputs`` field is non-empty. Cells with
        a missing or unsized ``outputs`` field count as having no output.
    """
    def has_output(cell):
        try:
            return len(cell['outputs']) > 0
        except (KeyError, TypeError):
            # no 'outputs' key, or a value without a length (e.g. None);
            # anything else is a real error and should propagate
            return False

    return sum(1 for cell in data.get_code_cells(nb_id) if has_output(cell))
def has_param(nb_id):
    """Tell whether the notebook appears to be parameterized.

    True when ``has_param_import`` reports a parameterization import, or
    when one of the first five code cells passes ``is_param_cell``.
    """
    # an import is conclusive on its own; the cells are not fetched then
    if has_param_import(nb_id):
        return True

    # otherwise look for manual parameterization near the top of the notebook
    leading_cells = data.get_code_cells(nb_id)[:5]
    return any(is_param_cell(cell) for cell in leading_cells)
def forwards_prop(nb_id):
    """Return the share of execution-order steps that go forwards.

    Only code cells with ``get_exec(cell) > 0`` are considered. A step is
    the transition between two consecutive such cells; the numerator comes
    from ``count_forwards``.

    Returns:
        The proportion as a float, or None when fewer than two cells have
        been executed (there is no step to measure).
    """
    executed = [
        cell for cell in data.get_code_cells(nb_id) if get_exec(cell) > 0
    ]

    # n executed cells give n - 1 transitions
    num_steps = len(executed) - 1
    if num_steps <= 0:
        return None

    forward_steps = count_forwards(nb_id)
    return float(forward_steps) / float(num_steps)
def count_forwards(nb_id):
    """Count how often the execution order increases between adjacent cells.

    Only code cells with ``get_exec(cell) > 0`` are considered. Each cell is
    compared with the next one in notebook order, and the pair is counted
    when the first cell's ``get_exec`` value is strictly smaller.
    """
    executed = [
        cell for cell in data.get_code_cells(nb_id) if get_exec(cell) > 0
    ]

    forward_steps = 0
    # pair every executed cell with its successor; the last cell has none
    for current, following in zip(executed, executed[1:]):
        if get_exec(current) < get_exec(following):
            forward_steps += 1

    return forward_steps
def has_error(nb_id):
    """Tell whether any executed code cell produced an error output.

    Only cells with ``get_exec(cell) > 0`` are inspected. An output counts
    as an error when its ``output_type`` is ``"error"`` or ``"pyerr"``.
    """
    error_types = ("error", "pyerr")

    # select the executed cells up front, before looking at any output
    executed = [
        cell for cell in data.get_code_cells(nb_id) if get_exec(cell) > 0
    ]

    for cell in executed:
        if 'outputs' not in cell:
            continue
        for output in cell['outputs']:
            if 'output_type' in output and output['output_type'] in error_types:
                return True

    return False
def ex_skip_average(nb_id):
    """Return the mean absolute jump in execution order between adjacent cells.

    Only code cells with ``get_exec(cell) > 0`` are considered. For each pair
    of consecutive such cells, the absolute difference of their ``get_exec``
    values is taken; the result is the average over all pairs.

    Returns:
        The average as a float, or None when fewer than two cells have been
        executed.
    """
    executed = [
        cell for cell in data.get_code_cells(nb_id) if get_exec(cell) > 0
    ]

    num_pairs = len(executed) - 1
    if num_pairs <= 0:
        return None

    # walk the cells pairwise: (first, second), (second, third), ...
    total_skip = 0
    for current, following in zip(executed, executed[1:]):
        total_skip += abs(get_exec(following) - get_exec(current))

    return float(total_skip) / float(num_pairs)
def get_language(nb_id):
    """Return the language recorded for a notebook, or None if not recorded.

    Two layouts are handled:

    * notebooks with a top-level ``worksheets`` key keep the language on each
      code cell; the first non-None ``language`` value among them is returned;
    * otherwise the language is read from ``metadata.kernelspec``, preferring
      its ``language`` entry and falling back to ``name``.

    Missing fields (a cell without ``language``, a notebook without
    ``metadata``, an empty ``kernelspec``) are treated as "not recorded"
    rather than raising.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        The recorded language value, or None.
    """
    nb = data.get_nb(nb_id)

    if 'worksheets' in nb:
        # language is stored per cell; take the first cell that declares one
        for cell in data.get_code_cells(nb_id):
            language = cell.get('language')
            if language is not None:
                return language
        return None

    # language is stored in the notebook-level metadata, if anywhere
    metadata = nb.get('metadata') or {}
    kernelspec = metadata.get('kernelspec') or {}
    if 'language' in kernelspec:
        return kernelspec['language']
    if 'name' in kernelspec:
        return kernelspec['name']

    return None
def count_non_exec(nb_id):
    """Count the non-empty code cells that have never been executed.

    A cell's source is read from its ``input`` field when present, otherwise
    from ``source``. Cells with neither field count as empty (they used to
    raise KeyError). A cell is unexecuted when ``get_exec(cell) == 0``.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        Number of code cells with a non-empty source and ``get_exec`` of 0.
    """
    def has_source(cell):
        # 'input' takes precedence over 'source' when both are present
        for field in ('input', 'source'):
            if field in cell:
                return len(cell[field]) > 0
        return False

    return sum(
        1
        for cell in data.get_code_cells(nb_id)
        if has_source(cell) and get_exec(cell) == 0
    )
def output_cell_prop(nb_id):
    """Return the proportion of code cells that have at least one output.

    The numerator is ``output_cells(nb_id)``; the denominator is the number
    of code cells returned by ``data.get_code_cells(nb_id)``.

    Args:
        nb_id: identifier of the notebook to inspect.

    Returns:
        The proportion as a float, or None when the notebook has no code
        cells (the ratio is undefined there; this used to raise
        ZeroDivisionError).
    """
    num_code_cells = len(data.get_code_cells(nb_id))

    # guard the division: no code cells means there is nothing to measure
    if num_code_cells == 0:
        return None

    return float(output_cells(nb_id)) / float(num_code_cells)
repo_id = notebook['repo_id'] # try generating the row of data row['nb_id'] = error_row['nb_id'] = nb_id row['repo_id'] = error_row['repo_id'] = repo_id # check if notebook has been filtered if notebook['filtered']: error_row['err_in'] = 'filtered out' error_writer.writerow(error_row) continue # check code cells may error if notebook file is empty try: # skip if there aren;t any code cells if len(data.get_code_cells(nb_id)) == 0: print(colored(identifier + ' has no code', 'yellow')) error_row['err_in'] = 'no code' error_writer.writerow(error_row) continue except: print(colored("nb file error in " + identifier, 'red')) error_row['err_in'] = 'nb file' error_writer.writerow(error_row) continue # check the api response try: repo_full_name = data.get_repo_metadata(nb_id)['full_name'] except: print(colored("api error in " + identifier, 'red'))