import json

import numpy as np
import pandas as pd
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Div, HoverTool
from bokeh.models.widgets import Panel, Tabs
from bokeh.palettes import Spectral8
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

from hail.utils import hadoop_ls, hadoop_open
from hail.utils.java import warning


def hail_metadata(t_path):
    """Create a metadata plot for a Hail Table or MatrixTable.

    Parameters
    ----------
    t_path : str
        Path to the Hail Table or MatrixTable files.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` or :class:`bokeh.models.widgets.panels.Tabs` or :class:`bokeh.models.layouts.Column`
    """
    def get_rows_data(rows_files):
        file_sizes = []
        partition_bounds = []
        parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
        if parts_file:
            parts = hadoop_ls(parts_file[0])
            for i, x in enumerate(parts):
                index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                # Consecutive part files can share a partition index;
                # only record the size of the last file for each index.
                if i < len(parts) - 1:
                    test_index = parts[i + 1]['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                    if test_index == index:
                        continue
                file_sizes.append(x['size_bytes'])
        metadata_file = [x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')]
        if metadata_file:
            with hadoop_open(metadata_file[0], 'rb') as f:
                rows_meta = json.loads(f.read())
                try:
                    # Each jRangeBounds entry describes a partition's key
                    # interval as locus start/end points.
                    partition_bounds = [
                        (x['start']['locus']['contig'], x['start']['locus']['position'],
                         x['end']['locus']['contig'], x['end']['locus']['position'])
                        for x in rows_meta['jRangeBounds']
                    ]
                except KeyError:
                    # Table is not keyed by locus; leave partition_bounds empty.
                    pass
        return partition_bounds, file_sizes

    def scale_file_sizes(file_sizes):
        min_file_size = min(file_sizes) * 1.1
        total_file_size = sum(file_sizes)
        all_scales = [
            ('T', 1e12),
            ('G', 1e9),
            ('M', 1e6),
            ('K', 1e3),
            ('', 1e0)
        ]
        # Pick one scale for the total and one for the per-file sizes.
        for overall_scale, overall_factor in all_scales:
            if total_file_size > overall_factor:
                total_file_size /= overall_factor
                break
        for scale, factor in all_scales:
            if min_file_size > factor:
                file_sizes = [x / factor for x in file_sizes]
                break
        total_file_size = f'{total_file_size:.1f} {overall_scale}B'
        return total_file_size, file_sizes, scale

    files = hadoop_ls(t_path)

    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    success_file = [x['modification_time'] for x in files if x['path'].endswith('SUCCESS')]

    metadata_file = [x['path'] for x in files if x['path'].endswith('metadata.json.gz')]
    if not metadata_file:
        raise FileNotFoundError('No metadata.json.gz file found.')

    with hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts']['counts']

    if not rows_file:
        raise FileNotFoundError('No rows directory found.')
    rows_files = hadoop_ls(rows_file[0])

    data_type = 'Table'
    if entries_file:
        # For a MatrixTable the row data lives one level deeper, under rows/rows.
        data_type = 'MatrixTable'
        rows_file = [x['path'] for x in rows_files if x['path'].endswith('rows')]
        rows_files = hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes)

    panel_size = 480
    subpanel_size = 120

    if not row_partition_bounds:
        warning('Table is not partitioned. Only plotting file sizes')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths': [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds': [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': ['Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
                             for x in row_partition_bounds],
        'row_file_sizes': row_file_sizes,
        'row_file_sizes_human': [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition': rows_per_partition,
        'index': list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hadoop_ls(entries_file[0])
        entries_rows_file = [x['path'] for x in entries_rows_files if x['path'].endswith('rows')]
        if entries_rows_file:
            entries_files = hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [f'{x:.1f} {entry_scale}B' for x in entry_file_sizes]

    title = f'{data_type}: {t_path}'

    msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
           f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
           f"Size: {total_file_size}<br/>")
    if success_file:
        msg += success_file[0]

    tools = "hover,save,pan,box_zoom,reset,wheel_zoom"

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition', 'row_file_sizes', color=color_map,
              legend='spans_chromosome', source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in
        ('rows_per_partition', 'row_file_sizes_human', 'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                              left=rows_per_partition_edges[:-1], right=rows_per_partition_edges[1:],
                              fill_color="#036564", line_color="#033649")

    row_file_sizes_hist, row_file_sizes_edges = np.histogram(all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist, left=0,
                     bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                     fill_color="#036564", line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'
        msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
               f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
               f"Size: {total_entry_file_size}<br/>")
        if success_file:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                                factors=list(set(all_data['spans_chromosome'])))
        p.scatter('rows_per_partition', 'entry_file_sizes', color=color_map,
                  legend='spans_chromosome', source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}') for x in
            ('rows_per_partition', 'entry_file_sizes_human', 'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                                  left=rows_per_partition_edges[:-1], right=rows_per_partition_edges[1:],
                                  fill_color="#036564", line_color="#033649")
        p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size)

        row_file_sizes_hist, row_file_sizes_edges = np.histogram(all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")

        entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title='Entries'),
            Panel(child=rows_grid, title='Rows')
        ])
    else:
        return rows_grid
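
# Example usage (a minimal sketch; the table path below is hypothetical, and
# output_file/show are the standard bokeh.io helpers for rendering a figure):
#
#     from bokeh.io import output_file, show
#
#     fig = hail_metadata('gs://my-bucket/my_table.ht')
#     output_file('metadata.html')   # write a standalone HTML report
#     show(fig)                      # open it in a browser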