def run(self):
    run_metrics = py_interop_run_metrics.run_metrics()
    run_metrics.run_info()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_metrics.read(self.runfolder, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    lanes = summary.lane_count()
    reads = self.get_non_index_reads(summary)
    for lane in range(lanes):
        # The interop library uses zero-based indexing; however, most people
        # use read 1/2 to denote the different reads. This enumeration
        # transforms the zero-based index into that form. /JD 2017-10-27
        for new_read_nbr, original_read_nbr in enumerate(reads):
            read = summary.at(original_read_nbr).at(lane)
            error_rate = read.error_rate().mean()
            percent_q30 = read.percent_gt_q30()
            self._send_to_subscribers(("error_rate",
                                       {"lane": lane + 1,
                                        "read": new_read_nbr + 1,
                                        "error_rate": error_rate}))
            self._send_to_subscribers(("percent_q30",
                                       {"lane": lane + 1,
                                        "read": new_read_nbr + 1,
                                        "percent_q30": percent_q30}))

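# The get_non_index_reads helper called above is not shown in the original
# snippet. A minimal sketch of what it presumably does, assuming the
# py_interop_summary API where summary.at(i).read().is_index() flags index
# reads:
def get_non_index_reads(self, summary):
    """Return the zero-based indexes of all non-index (template) reads."""
    return [index for index in range(summary.size())
            if not summary.at(index).read().is_index()]
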
def get_qscore_dataframe(run_folder: Path) -> pd.DataFrame:
    """
    Returns a pandas DataFrame containing x (Q-score), y (Total in millions)
    values for the run
    """
    df_dict = {}
    x_vals = []
    y_vals = []
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    valid_to_load[py_interop_run.Q] = 1
    run_metrics.read(str(run_folder), valid_to_load)
    bar_data = py_interop_plot.bar_plot_data()
    boundary = 30
    options = py_interop_plot.filter_options(
        run_metrics.run_info().flowcell().naming_method())
    py_interop_plot.plot_qscore_histogram(run_metrics, options, bar_data, boundary)
    for i in range(bar_data.size() - 1):
        x = [bar_data.at(i).at(j).x() for j in range(bar_data.at(i).size())]
        y = [bar_data.at(i).at(j).y() for j in range(bar_data.at(i).size())]
        x_vals += x
        y_vals += y
    df_dict['x'] = x_vals
    df_dict['y'] = y_vals
    df = pd.DataFrame.from_dict(df_dict)
    return df

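# Usage sketch for get_qscore_dataframe (the run folder path here is
# hypothetical): plot the returned histogram values with matplotlib.
import matplotlib.pyplot as plt

qscore_df = get_qscore_dataframe(Path("/data/runs/MyRun"))
plt.bar(qscore_df["x"], qscore_df["y"])
plt.xlabel("Q Score")
plt.ylabel("Total (millions)")
plt.savefig("qscore_histogram.png")
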
def main():
    """
    Retrieve run folder paths from the command line
    Ensure only metrics required for summary are loaded
    Load the run metrics
    Calculate the summary metrics
    Display error by lane, read
    """
    logging.basicConfig(level=logging.INFO)
    run_metrics = py_interop_run_metrics.run_metrics()
    summary = py_interop_summary.run_summary()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    for run_folder_path in sys.argv[1:]:
        run_folder = os.path.basename(run_folder_path)
        try:
            run_metrics.read(run_folder_path, valid_to_load)
        except Exception as ex:
            logging.warning("Skipping - cannot read RunInfo.xml: %s - %s"
                            % (run_folder, str(ex)))
            continue
        py_interop_summary.summarize_run_metrics(run_metrics, summary)
        error_rate_read_lane_surface = numpy.zeros(
            (summary.size(), summary.lane_count(), summary.surface_count()))
        for read_index in range(summary.size()):
            for lane_index in range(summary.lane_count()):
                for surface_index in range(summary.surface_count()):
                    error_rate_read_lane_surface[read_index, lane_index, surface_index] = \
                        summary.at(read_index).at(lane_index).at(surface_index).error_rate().mean()
        logging.info("Run Folder: " + run_folder)
        for read_index in range(summary.size()):
            read_summary = summary.at(read_index)
            logging.info("Read " + str(read_summary.read().number()))

def parse_single_json(run_folder_path, xrange=None):
    run_folder = os.path.basename(run_folder_path)
    run_metrics = py_interop_run_metrics.run_metrics()
    summary = py_interop_summary.run_summary()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    percent_occupied_df = pd.DataFrame()
    if 'MyRun' not in str(run_folder_path):
        percent_occupied_df = get_percent_occupied_by_lane(run_folder_path)
    summary_dict = {}
    try:
        run_metrics.read(run_folder_path, valid_to_load)
        py_interop_summary.summarize_run_metrics(run_metrics, summary)
        logging.info("Read: {}, Lane: {}".format(summary.size(), summary.lane_count()))
        for read_index in range(summary.size()):
            logging.info("Read {}".format(read_index + 1))
            summary_dict.setdefault("run", run_folder)
            for lane_index in range(summary.lane_count()):
                read_summary = summary.at(read_index)
                lane_summary = read_summary.at(lane_index)
                summary_dict.setdefault("data", []).append({
                    "runname": parse_runid(run_folder),
                    "read": read_summary.read().number(),
                    "lane": lane_summary.lane(),
                    "density": parse_float(lane_summary.density().mean() / 1000),
                    "density_stddev": parse_float(lane_summary.density().stddev() / 1000),
                    "clusterpf": parse_float(lane_summary.percent_pf().mean()),
                    "clusterpf_stddev": parse_float(lane_summary.percent_pf().stddev()),
                    "readsm": parse_float(lane_summary.reads() / 1000000),
                    "readspfm": parse_float(lane_summary.reads_pf() / 1000000),
                    "q30": parse_float(lane_summary.percent_gt_q30()),
                    "aligned": parse_float(lane_summary.percent_aligned().mean()),
                    "aligned_stddev": parse_float(lane_summary.percent_aligned().stddev()),
                    "errorrate": parse_float(lane_summary.error_rate().mean()),
                    "errorrate_stddev": parse_float(lane_summary.error_rate().stddev()),
                    "percent_occupied": parse_float(
                        parse_lane_occupancy(lane_index, percent_occupied_df))
                })
        return summary_dict
    except Exception as ex:
        logging.warning("Skipping - ERROR: %s - %s" % (run_folder, str(ex)))

def test_invalid_metric_type(self):
    """ Test that exceptions can be caught and they have the expected message """
    valid_to_load = py_interop_run.uchar_vector()
    try:
        py_interop_run_metrics.list_metrics_to_load("Unknown", valid_to_load,
                                                    py_interop_run.NovaSeq, True)
        self.fail("invalid_metric_type should have been thrown")
    except py_interop_run_metrics.invalid_metric_type as ex:
        self.assertEqual(str(ex).split('\n')[0],
                         "Unsupported metric type: Unknown")

def test_invalid_parameter(self):
    """ Test that exceptions can be caught and they have the expected message """
    valid_to_load = py_interop_run.uchar_vector(2)
    run_metrics = py_interop_run_metrics.run_metrics()
    try:
        run_metrics.read_metrics("", 3, valid_to_load, 1)
        self.fail("invalid_parameter should have been thrown")
    except py_interop_run_metrics.invalid_parameter as ex:
        self.assertEqual(
            str(ex).split('\n')[0],
            "Boolean array valid_to_load does not match expected number of metrics: 2 != 23")

def get_qc_run_summary(ngs_folder_path):
    run_folder = ngs_folder_path
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    return summary

def get_run_metrics_handle(run_dir):
    """ Load the goodies from the .bin files in the InterOp directory.
        This black magic is copy-pasted straight out of the tutorial linked above!
    """
    # print("Examining: {}".format(run_dir))
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    for v2l in (py_interop_run.Tile, py_interop_run.ExtendedTile):
        valid_to_load[v2l] = 1
    run_metrics = py_interop_run_metrics.run_metrics()
    run_metrics.read(run_dir, valid_to_load)
    return run_metrics

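# Usage sketch (the run directory is hypothetical): since Tile and
# ExtendedTile metrics were loaded above, the tile metric set can be
# inspected directly from the returned handle.
run_metrics = get_run_metrics_handle("/data/runs/MyRun")
tile_metrics = run_metrics.tile_metric_set()
print("Number of tile metric records:", tile_metrics.size())
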
def __init__(self, run_folder_path):
    # Initialize interop objects
    self.run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    # Read from run folder
    self.run_metrics.read(run_folder_path, valid_to_load)
    # Load up summary metrics
    self.summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(self.run_metrics, self.summary)
    # Cached result tables for subsequent calls
    self.run_summary_df = None
    self.read_summary_dfs = {}

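# Usage sketch: the class name below is hypothetical (only __init__ is shown
# in the original snippet), but the summary attribute follows directly from it.
# handler = RunFolderSummary("/data/runs/MyRun")
# print(handler.summary.total_summary().error_rate())
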
def plot_percent_base(run_folder: str, output_svg="percent_base.svg"):
    """
    Plots the base % across each cycle. Each line represents a different base.
    Reference lines are added for each read.

    Base %: The percentage of clusters for which the selected base
    (A, C, T, or G) has been called.
    """
    # Initialize interop objects
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)

    # Read from the run folder
    run_metrics.read(run_folder, valid_to_load)

    logger.info('Generating % base plot')
    plot_data = py_interop_plot.candle_stick_plot_data()
    options = py_interop_plot.filter_options(run_metrics.run_info().flowcell().naming_method())
    py_interop_plot.plot_by_cycle(run_metrics, "BasePercent", options, plot_data)

    # Plot each base
    for base_index in range(plot_data.size()):
        line_data = plot_data.at(base_index)
        x = [line_data.at(i).x() for i in range(line_data.size())]
        y = [line_data.at(i).y() for i in range(line_data.size())]
        plt.plot(x, y, color=line_data.color(), linewidth=0.5, label=line_data.title())

    # Plot reference lines for reads
    read_vector = run_metrics.run_info().reads()
    for read_index in range(read_vector.size()):
        read_name = f'R{read_vector[read_index].number()}'
        cycle_start = read_vector[read_index].first_cycle()
        plt.axvline(x=cycle_start, color='purple', linestyle='--', linewidth=0.35)
        plt.text(cycle_start, plt.gca().get_ylim()[1], read_name,
                 fontsize=8, color='purple')

    # Plot settings
    axes_data = plot_data.xyaxes()
    plt.xlabel(axes_data.x().label(), fontsize=10)
    plt.ylabel(axes_data.y().label(), fontsize=10)
    plt.title(plot_data.title(), fontsize=10)
    plt.legend()
    plt.ylim([axes_data.y().min(), axes_data.y().max()])
    plt.xlim([axes_data.x().min(), axes_data.x().max()])

    # Save figure
    plt.savefig(output_svg)

def parse(run_folder, dictionary):
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)

    # Parse data from interop files -- % reads over Q30, cluster density,
    # clusters passing filter
    dictionary["Percent Q30"] = round(summary.total_summary().percent_gt_q30(), 2)
    dictionary["Cluster density"] = round(
        summary.at(0).at(0).density().mean() / 1000, 2)
    dictionary["Percent PF"] = round(
        summary.at(0).at(0).percent_pf().mean(), 2)
    dictionary["Phasing"] = round(summary.at(0).at(0).phasing().mean(), 2)
    dictionary["Prephasing"] = round(
        summary.at(0).at(0).prephasing().mean(), 2)
    dictionary["Error rate"] = round(summary.total_summary().error_rate(), 2)
    dictionary["Aligned"] = round(summary.total_summary().percent_aligned(), 2)

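# Usage sketch for parse (the run folder path is hypothetical): the function
# fills the supplied dictionary in place.
qc_metrics = {}
parse("/data/runs/MyRun", qc_metrics)
print(qc_metrics["Percent Q30"], qc_metrics["Error rate"])
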
def get_qc_run_summary(ngs_folder_path):
    """ Creates the InterOp summary object.

    Input:
        ngs_folder_path = The Illumina folder containing the QC data.
            Should contain the following:
            1) InterOp/ containing *MetricsOut.bin files.
            2) RunInfo.xml
            3) RunParameters.xml
    Output:
        Returns False if an error occurs, otherwise:
        summary = The InterOp summary object.
            See https://github.com/Illumina/interop/blob/master/docs/src/Tutorial_01_Intro.ipynb
    """
    run_folder = ngs_folder_path
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    try:
        run_folder = run_metrics.read(run_folder, valid_to_load)
    except Exception:
        return False  # An error has occurred.
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    return summary

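# Usage sketch (folder path is hypothetical): guard against the False return
# value before reading totals from the summary.
summary = get_qc_run_summary("/data/runs/MyRun")
if summary is not False:
    print("Run %Q30:", summary.total_summary().percent_gt_q30())
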
def interop_parse(self):
    """
    Use interop to parse the files in the InterOp folder to extract the number
    of reads mapping to PhiX as well as the error rate
    """
    # Parse the files and load the data
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_metrics.read(self.path, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    # PhiX error rate for run over all "usable cycles"
    errorrate = summary.total_summary().error_rate()
    # Percent aligned PhiX
    pctaligned = summary.total_summary().percent_aligned()
    # Add the error rate and the percent of reads that align to PhiX to the
    # metadata object
    for sample in self.metadata:
        sample.run.error_rate = '{:.2f}'.format(errorrate)
        sample.run.phix_aligned = '{:.2f}'.format(pctaligned)

def get_percent_occupied_by_lane(run_folder_path):
    df = pd.DataFrame()
    for item in NOVASEQ:
        if 'myrun' not in run_folder_path.lower() and item.lower() in run_folder_path.lower():
            valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
            valid_to_load[py_interop_run.ExtendedTile] = 1
            valid_to_load[py_interop_run.Tile] = 1
            valid_to_load[py_interop_run.Extraction] = 1
            run_metrics = py_interop_run_metrics.run_metrics()
            run_metrics.read(run_folder_path, valid_to_load)
            columns = py_interop_table.imaging_column_vector()
            py_interop_table.create_imaging_table_columns(run_metrics, columns)
            headers = get_headers(columns, run_folder_path)
            column_count = py_interop_table.count_table_columns(columns)
            row_offsets = py_interop_table.map_id_offset()
            py_interop_table.count_table_rows(run_metrics, row_offsets)
            data = np.zeros((row_offsets.size(), column_count), dtype=np.float32)
            py_interop_table.populate_imaging_table_data(
                run_metrics, columns, row_offsets, data.ravel())
            header_subset = ["Lane", "Tile", "Cycle", "% Occupied"]
            header_index = [(header, headers.index(header))
                            for header in header_subset]
            ids = np.asarray([headers.index(header)
                              for header in header_subset[:3]])
            data_for_selected_header_subset = []
            for label, col in header_index:
                data_for_selected_header_subset.append(
                    (label,
                     pd.Series([val for val in data[:, col]],
                               index=[tuple(r) for r in data[:, ids]])))
            df = pd.DataFrame.from_dict(dict(data_for_selected_header_subset))
    return df

def parse_illumina_interop(run_dir):
    # Read interop
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    try:
        run_metrics.read(run_dir, valid_to_load)
    except Exception:
        sys.stderr.write("Cannot parse information in InterOp")
        sys.exit(2)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    lanes = summary.lane_count()
    reads = summary.size()

    # Parse the interop stats lane by lane for non-index reads
    run_stats_summary = dict()
    for lane in range(lanes):
        lane_nbr = getattr(summary.at(0).at(lane), "lane")()
        for read in range(reads):
            if not summary.at(read).read().is_index():
                stats = dict()
                stats.update({"density": getattr(summary.at(read).at(lane), "density")().mean() / 1000})
                stats.update({"error_rate": getattr(summary.at(read).at(lane), "error_rate")().mean()})
                stats.update({"first_cycle_intensity": getattr(summary.at(read).at(lane), "first_cycle_intensity")().mean()})
                stats.update({"percent_aligned": getattr(summary.at(read).at(lane), "percent_aligned")().mean()})
                stats.update({"percent_gt_q30": getattr(summary.at(read).at(lane), "percent_gt_q30")()})
                stats.update({"percent_pf": getattr(summary.at(read).at(lane), "percent_pf")().mean()})
                stats.update({"phasing": getattr(summary.at(read).at(lane), "phasing")().mean()})
                stats.update({"prephasing": getattr(summary.at(read).at(lane), "prephasing")().mean()})
                stats.update({"reads_pf": getattr(summary.at(read).at(lane), "reads_pf")()})
                stats.update({"yield_g": getattr(summary.at(read).at(lane), "yield_g")()})
                if lane_nbr not in list(run_stats_summary.keys()):
                    run_stats_summary.update({lane_nbr: {read: stats}})
                else:
                    run_stats_summary[lane_nbr].update({read: stats})
    return run_stats_summary

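# Usage sketch (run directory is hypothetical): the result is keyed by lane
# number, then by zero-based read index.
stats = parse_illumina_interop("/data/runs/MyRun")
for lane_nbr, lane_reads in stats.items():
    for read_idx, metrics in lane_reads.items():
        print(lane_nbr, read_idx, metrics["error_rate"])
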
def plot_tile_intensity(run_folder: str, output_svg_prefix='max_intensity'):
    # Initialize interop objects
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    valid_to_load[py_interop_run.Extraction] = 1

    # Read from the run folder
    run_metrics.read(run_folder, valid_to_load)

    # Get the extraction metrics
    extraction_metrics = run_metrics.extraction_metric_set()
    extraction_metrics.rebuild_index(True)

    # Format a DataFrame
    df = []
    for lane in extraction_metrics.lanes():
        print(f"Processing lane {lane}")
        for tile in extraction_metrics.tile_numbers_for_lane(lane):
            if not isinstance(tile, int):
                continue
            print(f"Processing tile {tile}")
            for cycle in range(extraction_metrics.max_cycle()):
                print(f"Processing cycle {cycle}")
                try:
                    extraction_metric = extraction_metrics.get_metric(lane, tile, cycle)
                except Exception:
                    continue
                for channel in range(extraction_metrics.channel_count()):
                    df.append(
                        dict(channel=channel,
                             lane=lane,
                             tile=tile,
                             cycle=cycle,
                             max_intensity=extraction_metric.max_intensity(channel)))
    df = pd.DataFrame(df)

    # Iterate over lanes
    for lane, lane_df in df.groupby('lane'):
        # Plot the change in max intensity over cycles
        sns.lineplot(data=lane_df, x='cycle', y='max_intensity')
        # Set a title
        plt.title(f"Lane {lane}")
        # Save to the SVG
        plt.savefig(f"{output_svg_prefix}_{lane}.svg")
        plt.close()

def parsing_run_metrics_files(local_run_metric_folder, run_process_obj, experiment_name):
    '''
    Description:
        The function parses the information from the run metric files
    Input:
        local_run_metric_folder    # local folder with the run metric files
        run_process_obj            # RunProcess object for this run
        experiment_name            # experiment name
    Import:
        py_interop_run
        py_interop_run_metrics
    Variables:
        bin_run_stats_summary_list # list of dictionaries with the summary information
        run_stats_read_list        # list of dictionaries with the read information
    Return:
        bin_run_stats_summary_list, run_stats_read_list
    '''
    logger = logging.getLogger(__name__)
    logger.debug('%s : Starting function parsing_run_metrics', experiment_name)
    run_param_obj = RunningParameters.objects.get(runName_id=run_process_obj)
    # get the number of lanes for the run
    number_of_lanes = int(run_param_obj.get_number_of_lanes())
    # get number of reads for the run
    num_of_reads = run_param_obj.get_number_of_reads()
    logger.info('%s : Fetched run information needed for running metrics', experiment_name)

    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_metric_folder = run_metrics.read(local_run_metric_folder, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)

    bin_run_stats_summary_list = []
    logger.info('%s : Starts collecting data for run metric', experiment_name)
    # get the Run Summary for each Read
    for read_level in range(num_of_reads):
        run_summary_stats_level = {}
        # summary yield total
        run_summary_stats_level['yieldTotal'] = format(
            summary.at(read_level).summary().yield_g(), '.3f')
        # summary projected total yield
        run_summary_stats_level['projectedTotalYield'] = format(
            summary.at(read_level).summary().projected_yield_g(), '.3f')
        # percent aligned
        run_summary_stats_level['aligned'] = format(
            summary.at(read_level).summary().percent_aligned(), '.3f')
        # error rate
        run_summary_stats_level['errorRate'] = format(
            summary.at(read_level).summary().error_rate(), '.3f')
        # intensity cycle 1
        run_summary_stats_level['intensityCycle'] = str(
            round(summary.at(read_level).summary().first_cycle_intensity()))
        # Q30
        run_summary_stats_level['biggerQ30'] = format(
            summary.at(read_level).summary().percent_gt_q30(), '.3f')
        run_summary_stats_level['level'] = str(read_level + 1)
        bin_run_stats_summary_list.append(run_summary_stats_level)
    logger.info('%s : Parsed run metrics on summary level', experiment_name)

    # get the run summary for Total
    run_summary_stats_level = {}
    # total yield
    run_summary_stats_level['yieldTotal'] = format(
        summary.total_summary().yield_g(), '.3f')
    # total projected yield
    run_summary_stats_level['projectedTotalYield'] = format(
        summary.total_summary().projected_yield_g(), '.3f')
    # total percent aligned
    run_summary_stats_level['aligned'] = format(
        summary.total_summary().percent_aligned(), '.3f')
    # total error rate
    run_summary_stats_level['errorRate'] = format(
        summary.total_summary().error_rate(), '.3f')
    # total intensity cycle
    run_summary_stats_level['intensityCycle'] = str(
        round(summary.total_summary().first_cycle_intensity()))
    # total Q30
    run_summary_stats_level['biggerQ30'] = format(
        summary.total_summary().percent_gt_q30(), '.3f')
    run_summary_stats_level['level'] = 'Total'
    logger.info('%s : Parsed run metrics on Total lane', experiment_name)
    bin_run_stats_summary_list.append(run_summary_stats_level)

    # get the run summary for Non Index
    run_summary_stats_level = {}
    # non index yield
    run_summary_stats_level['yieldTotal'] = format(
        summary.nonindex_summary().yield_g(), '.3f')
    # non index projected yield
    run_summary_stats_level['projectedTotalYield'] = format(
        summary.nonindex_summary().projected_yield_g(), '.3f')
    # non index percent aligned
    run_summary_stats_level['aligned'] = format(
        summary.nonindex_summary().percent_aligned(), '.3f')
    # non index error rate
    run_summary_stats_level['errorRate'] = format(
        summary.nonindex_summary().error_rate(), '.3f')
    # non index intensity cycle
    run_summary_stats_level['intensityCycle'] = str(
        round(summary.nonindex_summary().first_cycle_intensity()))
    # non index Q30
    run_summary_stats_level['biggerQ30'] = format(
        summary.nonindex_summary().percent_gt_q30(), '.3f')
    run_summary_stats_level['level'] = 'Non Index'
    logger.info('%s : Parsed run metrics for Non Index lane', experiment_name)
    bin_run_stats_summary_list.append(run_summary_stats_level)

    # information per read and lane
    run_stats_read_list = []
    for read_number in range(num_of_reads):
        for lane_number in range(number_of_lanes):
            logger.info('%s : Processing run metrics stats on Read %s and on Lane %s',
                        experiment_name, read_number, lane_number)
            run_read_stats_level = {}
            # tiles
            run_read_stats_level['tiles'] = str(
                int(summary.at(read_number).at(lane_number).tile_count()) * 2)
            # density (K/mm2): divide the value by 1000 to report it in K/mm2,
            # shown as mean +/- stddev
            try:
                read_lane_density_mean = str(round(float(
                    summary.at(read_number).at(lane_number).density().mean()) / 1000))
                read_lane_density_stddev = str(round(float(
                    summary.at(read_number).at(lane_number).density().stddev()) / 1000))
            except Exception:
                read_lane_density_mean = 'NaN'
                read_lane_density_stddev = 'NaN'
                string_message = experiment_name + ' : Unable to convert to float '
                logging_warnings(string_message, False)
            run_read_stats_level['density'] = (
                read_lane_density_mean + ' ' + chr(177) + ' ' + read_lane_density_stddev)
            # cluster PF in %
            try:
                read_lane_percent_pf_mean = format(
                    summary.at(read_number).at(lane_number).percent_pf().mean(), '.3f')
                read_lane_percent_pf_stddev = format(
                    summary.at(read_number).at(lane_number).percent_pf().stddev(), '.3f')
            except Exception:
                read_lane_percent_pf_mean = 'NaN'
                read_lane_percent_pf_stddev = 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_percent_pf'
                logging_warnings(string_message, False)
            run_read_stats_level['cluster_PF'] = (
                read_lane_percent_pf_mean + ' ' + chr(177) + ' ' + read_lane_percent_pf_stddev)
            # phasing / prephasing in %
            try:
                read_lane_phasing_mean = format(
                    summary.at(read_number).at(lane_number).phasing().mean(), '.3f')
                read_lane_phasing_dev = format(
                    summary.at(read_number).at(lane_number).phasing().stddev(), '.1f')
                read_lane_prephasing_mean = format(
                    summary.at(read_number).at(lane_number).prephasing().mean(), '.3f')
                read_lane_prephasing_stddev = format(
                    summary.at(read_number).at(lane_number).prephasing().stddev(), '.3f')
            except Exception:
                read_lane_phasing_mean, read_lane_phasing_dev, read_lane_prephasing_mean, read_lane_prephasing_stddev = 'NaN', 'NaN', 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_phasing'
                logging_warnings(string_message, False)
            run_read_stats_level['phas_prephas'] = (
                read_lane_phasing_mean + ' ' + chr(177) + ' ' + read_lane_phasing_dev
                + ' / ' + read_lane_prephasing_mean + ' ' + chr(177) + ' ' + read_lane_prephasing_stddev)
            # reads (M)
            try:
                run_read_stats_level['reads'] = format(
                    float(summary.at(read_number).at(lane_number).reads()) / 1000000, '.3f')
            except Exception:
                run_read_stats_level['reads'] = 'NaN'
                string_message = experiment_name + ' : Unable to format to float run_read_stats_level[reads]'
                logging_warnings(string_message, False)
            # reads PF (M)
            try:
                run_read_stats_level['reads_PF'] = format(
                    float(summary.at(read_number).at(lane_number).reads_pf()) / 1000000, '.3f')
            except Exception:
                run_read_stats_level['reads_PF'] = 'NaN'
                string_message = experiment_name + ' : Unable to format to float run_read_stats_level[reads_PF]'
                logging_warnings(string_message, False)
            # percent q30
            try:
                run_read_stats_level['q30'] = format(
                    summary.at(read_number).at(lane_number).percent_gt_q30(), '.3f')
            except Exception:
                run_read_stats_level['q30'] = 'NaN'
                string_message = experiment_name + ' : Unable to format to float run_read_stats_level[q30]'
                logging_warnings(string_message, False)
            # yield_g
            try:
                run_read_stats_level['yields'] = format(
                    summary.at(read_number).at(lane_number).yield_g(), '.3f')
            except Exception:
                run_read_stats_level['yields'] = 'NaN'
                string_message = experiment_name + ' : Unable to format to float run_read_stats_level[yields]'
                logging_warnings(string_message, False)
            # cycles error rated
            try:
                run_read_stats_level['cyclesErrRated'] = str(
                    summary.at(read_number).at(lane_number).cycle_state().error_cycle_range().first_cycle())
            except Exception:
                run_read_stats_level['cyclesErrRated'] = 'NaN'
                string_message = experiment_name + ' : Unable to format to float run_read_stats_level[cyclesErrRated]'
                logging_warnings(string_message, False)
            # percent aligned
            try:
                read_lane_percent_aligned_mean = format(
                    summary.at(read_number).at(lane_number).percent_aligned().mean(), '.3f')
                read_lane_percent_aligned_stddev = format(
                    summary.at(read_number).at(lane_number).percent_aligned().stddev(), '.3f')
            except Exception:
                read_lane_percent_aligned_mean, read_lane_percent_aligned_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_percent_aligned_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['aligned'] = (
                read_lane_percent_aligned_mean + ' ' + chr(177) + ' ' + read_lane_percent_aligned_stddev)
            # error rate
            try:
                read_lane_error_rate_mean = format(
                    summary.at(read_number).at(lane_number).error_rate().mean(), '.3f')
                read_lane_error_rate_stddev = format(
                    summary.at(read_number).at(lane_number).error_rate().stddev(), '.3f')
            except Exception:
                read_lane_error_rate_mean, read_lane_error_rate_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_error_rate_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['errorRate'] = (
                read_lane_error_rate_mean + ' ' + chr(177) + ' ' + read_lane_error_rate_stddev)
            # error rate 35
            try:
                read_lane_error_rate_35_mean = format(
                    summary.at(read_number).at(lane_number).error_rate_35().mean(), '.3f')
                read_lane_error_rate_35_stddev = format(
                    summary.at(read_number).at(lane_number).error_rate_35().stddev(), '.3f')
            except Exception:
                read_lane_error_rate_35_mean, read_lane_error_rate_35_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_error_rate_35_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['errorRate35'] = (
                read_lane_error_rate_35_mean + ' ' + chr(177) + ' ' + read_lane_error_rate_35_stddev)
            # error rate 50
            try:
                read_lane_error_rate_50_mean = format(
                    summary.at(read_number).at(lane_number).error_rate_50().mean(), '.3f')
                read_lane_error_rate_50_stddev = format(
                    summary.at(read_number).at(lane_number).error_rate_50().stddev(), '.3f')
            except Exception:
                read_lane_error_rate_50_mean, read_lane_error_rate_50_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_error_rate_50_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['errorRate50'] = (
                read_lane_error_rate_50_mean + ' ' + chr(177) + ' ' + read_lane_error_rate_50_stddev)
            # error rate 75
            try:
                read_lane_error_rate_75_mean = format(
                    summary.at(read_number).at(lane_number).error_rate_75().mean(), '.3f')
                read_lane_error_rate_75_stddev = format(
                    summary.at(read_number).at(lane_number).error_rate_75().stddev(), '.3f')
            except Exception:
                read_lane_error_rate_75_mean, read_lane_error_rate_75_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_error_rate_75_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['errorRate75'] = (
                read_lane_error_rate_75_mean + ' ' + chr(177) + ' ' + read_lane_error_rate_75_stddev)
            # error rate 100
            try:
                read_lane_error_rate_100_mean = format(
                    summary.at(read_number).at(lane_number).error_rate_100().mean(), '.3f')
                read_lane_error_rate_100_stddev = format(
                    summary.at(read_number).at(lane_number).error_rate_100().stddev(), '.3f')
            except Exception:
                read_lane_error_rate_100_mean, read_lane_error_rate_100_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_error_rate_100_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['errorRate100'] = (
                read_lane_error_rate_100_mean + ' ' + chr(177) + ' ' + read_lane_error_rate_100_stddev)
            # intensity cycle 1
            try:
                read_lane_intensity_cycle_mean = format(
                    summary.at(read_number).at(lane_number).first_cycle_intensity().mean(), '.3f')
                read_lane_intensity_cycle_stddev = format(
                    summary.at(read_number).at(lane_number).first_cycle_intensity().stddev(), '.3f')
            except Exception:
                read_lane_intensity_cycle_mean, read_lane_intensity_cycle_stddev = 'NaN', 'NaN'
                string_message = experiment_name + ' : Unable to format to float read_lane_intensity_cycle_mean'
                logging_warnings(string_message, False)
            run_read_stats_level['intensityCycle'] = (
                read_lane_intensity_cycle_mean + ' ' + chr(177) + ' ' + read_lane_intensity_cycle_stddev)
            run_read_stats_level['read'] = str(read_number + 1)
            run_read_stats_level['lane'] = str(lane_number + 1)
            # append run_read_stats_level information to run_stats_read_list
            run_stats_read_list.append(run_read_stats_level)
    logger.debug('%s : End function parsing_run_metrics', experiment_name)
    return bin_run_stats_summary_list, run_stats_read_list

def process_files(run_folders, channel, curve_type, **extra):
    '''
    Read the InterOp data and plot the distortion curves for each input run folder

    :param run_folders: list of run folders to process
    :param output: output image name (optional)
    :param extra: unused arguments
    '''
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    valid_to_load[py_interop_run.OpticalModel] = 1
    run_coeff_map = {}
    channels = None
    selected_channel_index = -1
    for run_folder in run_folders:
        _logger.info("Parsing coefficients for %s" % run_folder)
        basename = os.path.basename(run_folder)
        try:
            run_metrics.read(run_folder, valid_to_load)
        except Exception:
            _logger.warning("Skipping - cannot read RunInfo.xml: %s" % basename)
            continue
        if channels is None:
            channels = [name.lower() for name in run_metrics.run_info().channels()]
            try:
                selected_channel_index = channels.index(channel.lower())
            except ValueError:
                selected_channel_index = -1
                if channel != "All":
                    raise RuntimeError("Channel not found: %s not in %s"
                                       % (channel, ",".join(channels)))
        if basename == "":
            basename = os.path.basename(os.path.dirname(run_folder))
        if run_metrics.distortion_metric_set().empty():
            _logger.info("No distortion coefficients found: Parsing log file")
            try:
                run_coeff_map[basename] = parse_log_file(run_folder, run_metrics.run_info())
            except Exception:
                _logger.warning("Skipping - cannot parse log file: %s" % basename)
                continue
        else:
            run_coeff_map[basename] = create_plot(run_metrics.distortion_metric_set())
    if len(run_coeff_map) == 0:
        raise Exception("No run folders read")
    if selected_channel_index == -1:
        raise NotImplementedError("All channels is not implemented")

    colormap = cm.cool
    lane_count = run_metrics.run_info().flowcell().lane_count()
    surface_count = run_metrics.run_info().flowcell().surface_count()
    lane_surface_norm = matplotlib.colors.Normalize(vmin=0, vmax=surface_count * lane_count)
    for run_folder, (y_values, x_values) in run_coeff_map.items():
        _logger.info("Drawing curves for %s, %d" % (run_folder, len(y_values)))
        if len(y_values) == 0:
            _logger.warning("Skipping %s - missing curve" % run_folder)
            continue
        for fig in create_figure(run_folder, **extra):
            pylab.title(run_folder)
            pylab.xlabel("Distance from Center of Image")
            pylab.ylabel("Distortion (Pixels)")
            has_label = {}
            for lane, tile, y_for_channel in y_values:
                surface = int(str(tile)[0])
                lane = int(lane)
                lane_surface = (surface - 1) * lane_count + lane - 1
                label = None
                if lane_surface not in has_label:
                    has_label[lane_surface] = True
                    label = "%d_%d" % (lane, surface)
                pylab.plot(x_values,
                           y_for_channel[selected_channel_index][curve_type],
                           color=colormap(lane_surface_norm(lane_surface)),
                           label=label)
            pylab.legend(loc='upper center')
    if extra['output'] == "":
        pylab.show()

def parse_interop_data(run_folder_dir, num_reads, num_lanes):
    """
    Parses summary statistics out of interops data using the Illumina
    interops package
    """
    # make empty dict to store output
    interop_dict = {'read_summaries': {}}

    # taken from illumina interops package documentation, all of this is
    # required, even though only the summary variable is used further on
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder_dir, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)

    for read in range(num_reads):
        new_read = read + 1
        if new_read not in interop_dict['read_summaries']:
            interop_dict['read_summaries'][new_read] = {}
        for lane in range(num_lanes):
            new_lane = lane + 1
            if new_lane not in interop_dict['read_summaries'][new_read]:
                interop_dict['read_summaries'][new_read][new_lane] = {}
            lane_summary = summary.at(read).at(lane)
            read_lane_dict = interop_dict['read_summaries'][new_read][new_lane]
            read_lane_dict['percent_q30'] = lane_summary.percent_gt_q30()
            read_lane_dict['density'] = lane_summary.density().mean()
            read_lane_dict['density_pf'] = lane_summary.density_pf().mean()
            read_lane_dict['cluster_count'] = lane_summary.cluster_count().mean()
            read_lane_dict['cluster_count_pf'] = lane_summary.cluster_count_pf().mean()
            read_lane_dict['error_rate'] = lane_summary.error_rate().mean()
            read_lane_dict['percent_aligned'] = lane_summary.percent_aligned().mean()
            read_lane_dict['percent_pf'] = lane_summary.percent_pf().mean()
            read_lane_dict['phasing'] = lane_summary.phasing().mean()
            read_lane_dict['prephasing'] = lane_summary.prephasing().mean()
            read_lane_dict['reads'] = lane_summary.reads()
            read_lane_dict['reads_pf'] = lane_summary.reads_pf()
            read_lane_dict['yield_g'] = lane_summary.yield_g()
            for key in read_lane_dict:
                if math.isnan(read_lane_dict[key]):
                    read_lane_dict[key] = None

    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_index_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder_dir, valid_to_load)
    return interop_dict

# Assign lane artifacts in order of outputs (as we get them from LIMS),
# and also rename the artifacts
for laneno, art in zip(range(1, 5), [o['uri'] for _, o in ios]):
    lane_artifacts[laneno] = art
    art.name = "Lane {}:1".format(laneno)

# run_dir_all = glob.glob("/data/runScratch.boston/demultiplexed/*/*/{}".format(run_id))
run_dir_all = glob.glob("/data/runScratch.boston/nova-interop-temp-marius/{}".format(run_id))
if not run_dir_all:
    print("No run folder for", run_id)
else:
    run_dir = run_dir_all[0]
    try:
        # Ignore parsing errors, to not disturb the sequencer integrations
        # Parse InterOp data
        valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
        py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
        valid_to_load[py_interop_run.ExtendedTile] = 1
        run_metrics = py_interop_run_metrics.run_metrics()
        run_metrics.read(run_dir, valid_to_load)
        summary = py_interop_summary.run_summary()
        py_interop_summary.summarize_run_metrics(run_metrics, summary)
        extended_tile_metrics = run_metrics.extended_tile_metric_set()
        read_count = summary.size()
        lane_count = summary.lane_count()
        if lane_count != len(lane_artifacts):
            raise RuntimeError("Error: Number of lanes in InterOp data: {}, does not match "
                               "the number of lanes in LIMS: {}.".format(
                                   lane_count, len(lane_artifacts)))

def plot_occupancy(run_folder: str, output_jpg_prefix="occupancy"):
    """
    To optimize loading concentrations on the NovaSeq platform, the % Occupied
    and % Pass Filter metrics can be plotted to determine if a run was
    underloaded, optimally loaded, or overloaded.

    More information:
    https://support.illumina.com/bulletins/2020/03/plotting---occupied-by---pass-filter-to-optimize-loading-concent.html
    """
    # Initialize interop objects
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    valid_to_load[py_interop_run.ExtendedTile] = 1
    valid_to_load[py_interop_run.Tile] = 1
    valid_to_load[py_interop_run.Extraction] = 1

    # Read from the run folder
    run_metrics.read(run_folder, valid_to_load)

    # Create the columns
    columns = py_interop_table.imaging_column_vector()
    py_interop_table.create_imaging_table_columns(run_metrics, columns)
    headers = []
    for i in range(columns.size()):
        column = columns[i]
        if column.has_children():
            headers.extend([f"{column.name()} ({subname})"
                            for subname in column.subcolumns()])
        else:
            headers.append(column.name())
    column_count = py_interop_table.count_table_columns(columns)
    row_offsets = py_interop_table.map_id_offset()
    py_interop_table.count_table_rows(run_metrics, row_offsets)
    data = np.zeros((row_offsets.size(), column_count), dtype=np.float32)
    py_interop_table.populate_imaging_table_data(
        run_metrics, columns, row_offsets, data.ravel())

    # Make a DataFrame
    df = pd.DataFrame(data, columns=headers)

    # Skip if there is no data (% Occupied is only available on NovaSeq)
    if df.shape[0] == 0 or "% Occupied" not in df:
        print("Occupancy plot skipped, no data available")
        return

    x = "% Occupied"
    y = "% Pass Filter"
    hues = ["Tile", "Lane", "Cycle"]

    # Make a few different types of plots
    for hue in hues:
        sns.scatterplot(data=df, x=x, y=y, hue=hue, alpha=0.5, linewidth=0)
        plt.xlim([0, 100])
        plt.ylim([0, 100])
        plt.legend(title=hue, bbox_to_anchor=[1.2, 0.9])
        plt.tight_layout()
        plt.savefig(f"{output_jpg_prefix}_{hue.lower()}.jpg", dpi=600)
        plt.close()

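# Usage sketch (the run folder is hypothetical): generates occupancy_tile.jpg,
# occupancy_lane.jpg, and occupancy_cycle.jpg for a NovaSeq run.
plot_occupancy("/data/runs/MyNovaSeqRun", output_jpg_prefix="occupancy")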