def _match_on_cpu_time(self, jm_dataset: Dataset, wm_dataset: Dataset, jm_subset=None, wm_subset=None):
    """Pair JM and WM jobs whose rounded CPU time values are equal.

    Candidate pairs come from an inner merge on the rounded CPU time of
    both frames. They are narrowed by ``_filter_matches``, and only JM
    entries left with exactly one candidate are returned.

    Note: the frames used for matching are modified. A ``cpuApprox``
    column is added to each and both are passed to ``_prefix_columns``
    (presumably an in-place column rename; confirm against the helper).

    :param jm_dataset: dataset supplying the JM frame and column names.
    :param wm_dataset: dataset supplying the WM frame and column names.
    :param jm_subset: frame used instead of ``jm_dataset.df`` when not None.
    :param wm_subset: frame used instead of ``wm_dataset.df`` when not None.
    :return: frame with two columns, named after the index names of the
        JM and WM frames, holding the matched index values.
    """
    jm_frame = jm_dataset.df if jm_subset is None else jm_subset
    wm_frame = wm_dataset.df if wm_subset is None else wm_subset

    # CPU times are floats; compare them after rounding so that small
    # rounding errors do not prevent a match.
    jm_cpu_col = jm_dataset.col(Metric.CPU_TIME)
    jm_frame['cpuApprox'] = jm_frame[jm_cpu_col].round()
    wm_cpu_col = wm_dataset.col(Metric.CPU_TIME)
    wm_frame['cpuApprox'] = wm_frame[wm_cpu_col].round()

    # Remember the index names now; after reset_index() they are the
    # column names that identify the jobs in the merged frame.
    # NOTE(review): this assumes both frames have a named index - confirm.
    jm_key = jm_frame.index.name
    wm_key = wm_frame.index.name

    self._prefix_columns(jm_frame, 'jmdf_')
    self._prefix_columns(wm_frame, 'wmdf_')

    jm_flat = jm_frame.reset_index()
    wm_flat = wm_frame.reset_index()
    candidates = jm_flat.merge(
        wm_flat,
        left_on='jmdf_cpuApprox',
        right_on='wmdf_cpuApprox',
    )

    remaining = self._filter_matches(
        candidates,
        jm_dataset,
        wm_dataset,
        jmdf_prefix='jmdf_',
        wmdf_prefix='wmdf_',
    )

    def has_single_candidate(group):
        return len(group) == 1

    # Keep only JM jobs that ended up with exactly one candidate row.
    unique_pairs = remaining.groupby(jm_key).filter(has_single_candidate)

    return unique_pairs[[jm_key, wm_key]]
def _filter_matches(self, matches, jm_dataset: Dataset, wm_dataset, jmdf_prefix='jmdf_', wmdf_prefix='wmdf_'):
    """Narrow candidate JM/WM pairs by timestamps and workflow.

    A row of ``matches`` is kept only if, for both the start and the
    stop time, the value returned by ``_timestamp_diff_series`` for the
    JM and WM columns is below ``self.timestamp_tolerance``, and if the
    workflow columns of both sides are equal.

    Missing timestamps get no explicit treatment here; whether such rows
    survive depends on what ``_timestamp_diff_series`` yields for them.

    :param matches: frame of candidate pairs with prefixed columns.
    :param jm_dataset: dataset used to look up JM column names.
    :param wm_dataset: dataset used to look up WM column names.
    :param jmdf_prefix: prefix carried by the JM columns in ``matches``.
    :param wmdf_prefix: prefix carried by the WM columns in ``matches``.
    :return: the rows of ``matches`` that passed every check.
    """
    remaining = matches

    # Apply the filters one after another so that each timestamp check
    # only sees rows that survived the previous one.
    for ts_metric in (Metric.START_TIME, Metric.STOP_TIME):
        jm_col = jmdf_prefix + jm_dataset.col(ts_metric)
        wm_col = wmdf_prefix + wm_dataset.col(ts_metric)

        ts_diff = self._timestamp_diff_series(remaining[jm_col], remaining[wm_col])
        within_tolerance = ts_diff < self.timestamp_tolerance
        remaining = remaining[within_tolerance]

    # Only accept jobs that match in their workflow.
    jm_workflow = jmdf_prefix + jm_dataset.col(Metric.WORKFLOW)
    wm_workflow = wmdf_prefix + wm_dataset.col(Metric.WORKFLOW)
    same_workflow = remaining[jm_workflow] == remaining[wm_workflow]

    return remaining[same_workflow]
def match_on_workflow(self, jmdf, wmdf, jmset: Dataset, wmset: Dataset, exclusion_limit=200):
    """Match JM and WM jobs workflow by workflow.

    Both frames are grouped by their workflow column. For every workflow
    present in both, each JM job is passed to ``_filter_by_timestamp``
    together with the WM group, and the pair is recorded when exactly
    one WM entry comes back.

    :param jmdf: JM frame; its index name becomes a result column.
    :param wmdf: WM frame; its index name becomes a result column.
    :param jmset: dataset used to look up JM column names.
    :param wmset: dataset used to look up WM column names.
    :param exclusion_limit: when greater than 0, JM groups with more
        rows than this are skipped.
    :return: frame with one column per index name holding the matched
        JM and WM index values.
    """
    jm_by_workflow = jmdf.groupby(jmset.col(Metric.WORKFLOW))
    wm_by_workflow = wmdf.groupby(wmset.col(Metric.WORKFLOW))

    jm_total = jmdf.shape[0]
    compared_jobs = 0

    # NOTE(review): if both frames share the same index name, the two
    # keys collapse into a single entry and both id lists below are the
    # same object.
    jm_index_name = jmdf.index.name
    wm_index_name = wmdf.index.name
    matches = {jm_index_name: [], wm_index_name: []}
    jm_matched = matches[jm_index_name]
    wm_matched = matches[wm_index_name]

    for workflow, jm_group in jm_by_workflow:
        try:
            wm_group = wm_by_workflow.get_group(workflow)
        except KeyError:
            # The workflow does not occur in the WM frame.
            continue

        if jm_group.empty:
            continue
        if wm_group.empty:
            continue

        # Todo Maybe also compare large groups?
        if 0 < exclusion_limit < len(jm_group):
            continue

        logging.debug("Checking for matches in {} WM, {} JM, ".format(
            len(wm_group), len(jm_group)))

        self._prefix_columns(jm_group, 'jmdf_')
        self._prefix_columns(wm_group, 'wmdf_')

        found_in_group = 0
        for jm_id, jm_row in jm_group.iterrows():
            candidates = self._filter_by_timestamp(
                jm_row,
                wm_group,
                jmset,
                wmset,
                left_prefix='jmdf_',
                right_prefix='wmdf_',
            )
            if len(candidates) != 1:
                continue

            # Exactly one candidate: record the JM row label together
            # with the index value of that single WM entry.
            jm_matched.append(jm_id)
            wm_matched.append(candidates.index.values[0])
            found_in_group += 1

        compared_jobs += len(jm_group)

        summary = "Found {} matches (of {} WM, {} JM, ".format(
            found_in_group, len(wm_group), len(jm_group))
        progress = "{:.4}% compared).".format(100 * compared_jobs / jm_total)
        logging.debug(summary + progress)

    return pd.DataFrame.from_dict(matches)