Пример #1
0
    def _match_on_cpu_time(self,
                           jm_dataset: Dataset,
                           wm_dataset: Dataset,
                           jm_subset=None,
                           wm_subset=None):
        """Pair up JM and WM rows whose rounded CPU time is equal.

        Candidate pairs are produced by merging both frames on the rounded
        CPU time and are then narrowed down by ``_filter_matches``. Only JM
        rows that are left with exactly one candidate are reported.

        NOTE(review): the frames that are used (the subsets, or the
        datasets' own ``df`` when no subset is given) get a new
        ``cpuApprox`` column and are passed to ``_prefix_columns``, whose
        return value is ignored -- presumably it renames the columns in
        place, so callers should expect their frames to be modified.
        Confirm against ``_prefix_columns``.

        Args:
            jm_dataset: Provides the JM column names and, when ``jm_subset``
                is None, the JM data frame.
            wm_dataset: Provides the WM column names and, when ``wm_subset``
                is None, the WM data frame.
            jm_subset: Optional frame used instead of ``jm_dataset.df``.
            wm_subset: Optional frame used instead of ``wm_dataset.df``.

        Returns:
            The two columns of the filtered merge result that are named
            after the JM and the WM index, one row per unambiguous match.
        """
        jm_frame = jm_dataset.df if jm_subset is None else jm_subset
        wm_frame = wm_dataset.df if wm_subset is None else wm_subset

        # Float CPU times rarely compare equal exactly, so the join key is
        # the rounded value.
        jm_cpu_col = jm_dataset.col(Metric.CPU_TIME)
        jm_frame['cpuApprox'] = jm_frame[jm_cpu_col].round()
        wm_cpu_col = wm_dataset.col(Metric.CPU_TIME)
        wm_frame['cpuApprox'] = wm_frame[wm_cpu_col].round()

        # reset_index() below turns a named index into a column of that
        # name; these names are used to address those columns afterwards.
        jm_key = jm_frame.index.name
        wm_key = wm_frame.index.name

        self._prefix_columns(jm_frame, 'jmdf_')
        self._prefix_columns(wm_frame, 'wmdf_')

        jm_flat = jm_frame.reset_index()
        wm_flat = wm_frame.reset_index()
        candidates = jm_flat.merge(wm_flat,
                                   left_on='jmdf_cpuApprox',
                                   right_on='wmdf_cpuApprox')

        narrowed = self._filter_matches(candidates,
                                        jm_dataset,
                                        wm_dataset,
                                        jmdf_prefix='jmdf_',
                                        wmdf_prefix='wmdf_')

        def has_single_candidate(group):
            return len(group) == 1

        # Drop every JM entry that still has more than one WM candidate.
        unambiguous = narrowed.groupby(jm_key).filter(has_single_candidate)

        return unambiguous[[jm_key, wm_key]]
Пример #2
0
    def _filter_matches(self,
                        matches,
                        jm_dataset: Dataset,
                        wm_dataset,
                        jmdf_prefix='jmdf_',
                        wmdf_prefix='wmdf_'):
        """Keep only candidate pairs that agree on timestamps and workflow.

        A row of ``matches`` is kept when, for both the start and the stop
        time, the result of ``_timestamp_diff_series`` for the JM and WM
        column is below ``self.timestamp_tolerance``, and the JM and WM
        workflow columns hold equal values.

        Args:
            matches: Frame of candidate pairs containing the prefixed JM
                and WM columns.
            jm_dataset: Provides the (unprefixed) JM column names.
            wm_dataset: Provides the (unprefixed) WM column names.
            jmdf_prefix: Prefix of the JM columns inside ``matches``.
            wmdf_prefix: Prefix of the WM columns inside ``matches``.

        Returns:
            The rows of ``matches`` that passed all checks.
        """
        for metric in (Metric.START_TIME, Metric.STOP_TIME):
            jm_time_col = jmdf_prefix + jm_dataset.col(metric)
            wm_time_col = wmdf_prefix + wm_dataset.col(metric)

            deviation = self._timestamp_diff_series(matches[jm_time_col],
                                                    matches[wm_time_col])

            # NOTE(review): a disabled earlier variant additionally let rows
            # through when either timestamp was null; verify how null
            # timestamps are supposed to be treated here.
            within_tolerance = deviation < self.timestamp_tolerance
            matches = matches[within_tolerance]

        jm_workflow_col = jmdf_prefix + jm_dataset.col(Metric.WORKFLOW)
        wm_workflow_col = wmdf_prefix + wm_dataset.col(Metric.WORKFLOW)

        # A pair is only acceptable if both sides name the same workflow.
        same_workflow = matches[jm_workflow_col] == matches[wm_workflow_col]

        return matches[same_workflow]
Пример #3
0
    def match_on_workflow(self,
                          jmdf,
                          wmdf,
                          jmset: Dataset,
                          wmset: Dataset,
                          exclusion_limit=200):
        """Match JM and WM jobs workflow by workflow.

        Both frames are grouped by their workflow column. For every
        workflow present in both frames, each JM row is compared against
        the WM rows of the same workflow via ``_filter_by_timestamp``; a
        pair is recorded only when exactly one WM row is returned.

        Args:
            jmdf: JM data frame; its index name labels the JM result column.
            wmdf: WM data frame; its index name labels the WM result column.
            jmset: Provides the JM column names.
            wmset: Provides the WM column names.
            exclusion_limit: Workflows with more JM rows than this are
                skipped. A value of 0 or less disables the limit.

        Returns:
            A DataFrame with one column per index name, holding the JM and
            WM index labels of the recorded pairs.
        """
        jm_by_workflow = jmdf.groupby(jmset.col(Metric.WORKFLOW))
        wm_by_workflow = wmdf.groupby(wmset.col(Metric.WORKFLOW))

        compared_so_far = 0
        jm_total = jmdf.shape[0]

        # The result columns are keyed by the index names of the two frames.
        # NOTE(review): this assumes the two index names differ -- confirm.
        jm_key = jmdf.index.name
        wm_key = wmdf.index.name
        matches = {jm_key: [], wm_key: []}
        jm_hits = matches[jm_key]
        wm_hits = matches[wm_key]

        for workflow, jm_group in jm_by_workflow:
            try:
                wm_group = wm_by_workflow.get_group(workflow)
            except KeyError:
                # This workflow does not occur in the WM frame at all.
                continue

            if jm_group.empty or wm_group.empty:
                continue

            jm_count = len(jm_group)
            wm_count = len(wm_group)

            # TODO: large groups are currently skipped entirely; maybe they
            # should be compared as well.
            if exclusion_limit > 0 and jm_count > exclusion_limit:
                continue

            logging.debug("Checking for matches in {} WM, {} JM, ".format(
                wm_count, jm_count))

            self._prefix_columns(jm_group, 'jmdf_')
            self._prefix_columns(wm_group, 'wmdf_')

            hits_in_group = 0
            for jm_label, jm_row in jm_group.iterrows():
                candidates = self._filter_by_timestamp(jm_row,
                                                       wm_group,
                                                       jmset,
                                                       wmset,
                                                       left_prefix='jmdf_',
                                                       right_prefix='wmdf_')

                # Anything but exactly one candidate is not recorded.
                if len(candidates) != 1:
                    continue

                # iterrows() already yields the index label of the JM row.
                jm_hits.append(jm_label)
                # Index label of the single remaining WM row.
                wm_hits.append(candidates.index.values[0])
                hits_in_group += 1

            compared_so_far += jm_count

            percent_compared = 100 * compared_so_far / jm_total
            summary = "Found {} matches (of {} WM, {} JM, ".format(
                hits_in_group, wm_count, jm_count)
            summary = summary + "{:.4}% compared).".format(percent_compared)
            logging.debug(summary)

        return pd.DataFrame.from_dict(matches)