Example #1
0
    def build_review_data(self, data_path=None, output=None):
        """Merge stored actuals into the training data and save a review dataset.

        Args:
            data_path: Source dataset path; defaults to
                ``self.options['data_path']``.
            output: Destination path; defaults to
                ``<data_path>_review_<uid>.feather.zstd``.

        Returns:
            The path the combined dataset was written to.
        """
        data_path = data_path or self.options['data_path']

        ds_train = DataFrame.create_dataframe(data_path)

        # Newest actuals first so the most recent records survive dedup.
        actuals_glob = os.path.join(self.model_path,
                                    "predictions/*_actuals.feather.zstd")
        actual_files = fsclient.list_folder(actuals_glob,
                                            wild=True,
                                            remove_folder_name=False,
                                            meta_info=True)
        actual_files.sort(key=lambda item: item['last_modified'], reverse=True)

        for _, ds_actuals in DataFrame.load_from_files(actual_files):
            if ds_actuals.df.empty:
                continue

            ds_actuals.drop(['prediction_id', 'prediction_group_id'])
            ds_train.df = pd.concat(
                [ds_train.df, ds_actuals.df[ds_train.columns]],
                ignore_index=True)
            ds_train.drop_duplicates()

        if not output:
            output = os.path.splitext(
                data_path)[0] + "_review_%s.feather.zstd" % (get_uid())

        ds_train.saveToFile(output)
        return output
Example #2
0
    def count_actuals_by_prediction_id(self):
        """Count recorded actuals per prediction group.

        Scans every ``*_actuals.feather.zstd`` file under the model's
        predictions folder and accumulates, for each
        ``prediction_group_id``, the number of distinct ``prediction_id``
        rows that have an actual value recorded.

        Returns:
            dict: ``{prediction_group_id: count}`` summed over all files.
        """
        res = {}
        features = [
            'prediction_group_id', 'prediction_id', self.target_feature
        ]
        counter = ProbabilisticCounter()

        all_files = fsclient.list_folder(os.path.join(
            self.model_path, "predictions/*_actuals.feather.zstd"),
                                         wild=True,
                                         remove_folder_name=False,
                                         meta_info=False)

        for (file, df) in DataFrame.load_from_files(all_files, features):
            ModelReview._remove_duplicates_by(df, 'prediction_id', counter)

            agg = df.df.groupby(['prediction_group_id',
                                 'prediction_id']).count()
            # Collapse to one row per prediction_id so duplicated
            # prediction_id's inside a group are counted once.
            agg[self.target_feature] = 1
            agg = agg.groupby('prediction_group_id').count()

            for prediction_group_id, row in agg.iterrows():
                # BUGFIX: Series indexing with row[0] is label-based;
                # positional access via [] is deprecated (pandas >= 2.1)
                # and only worked here while the first column label
                # happened to resolve. Use .iloc[0] for the first value.
                count = row.iloc[0]

                # Accumulate with a default instead of an if/else branch.
                res[prediction_group_id] = res.get(prediction_group_id,
                                                   0) + count

        return res
Example #3
0
    def _prediction_files_by_day(model_path, date_from, date_to, path_suffix):
        """Yield ``(day, files)`` pairs of prediction files for a model.

        When ``date_from`` is set, iterates each day from ``date_from``
        through ``date_to`` inclusive and lists that day's files.
        Otherwise lists all matching files once under the key ``"today"``.
        """
        def _list_files(pattern):
            # Single wildcard listing shared by both branches.
            return fsclient.list_folder(pattern,
                                        wild=True,
                                        remove_folder_name=False,
                                        meta_info=False)

        if not date_from:
            pattern = os.path.join(model_path,
                                   "predictions/" + "*" + path_suffix)
            yield ("today", _list_files(pattern))
            return

        day = convert_to_date(date_from)
        last_day = convert_to_date(date_to)
        one_day = datetime.timedelta(days=1)

        while day <= last_day:
            pattern = os.path.join(model_path,
                                   "predictions/" + str(day) + path_suffix)
            yield (day, _list_files(pattern))
            day += one_day
Example #4
0
def download_file(remote_path, local_dir, file_name, force_download=False):
    """Download ``remote_path`` into ``local_dir``, reusing any cached copy.

    If ``file_name`` is given and a local file ``<file_name>.*`` already
    exists in ``local_dir``, that copy is reused unless ``force_download``
    is set. Otherwise the local name is derived from the remote file info.

    Args:
        remote_path: Source location of the file.
        local_dir: Directory the file is stored in locally.
        file_name: Desired local base name (extension taken from the
            remote side); may be falsy to use the remote file's own name.
        force_download: Re-download even when a local copy already exists.

    Returns:
        str: Path of the local file.

    Raises:
        Exception: If the remote path does not exist or is inaccessible.
    """
    local_file_path = ""
    # BUGFIX: the original local was named ``download_file``, shadowing
    # this very function; renamed to keep the function callable by name.
    need_download = True
    remote_file_info = {}

    # Lazy %-args: the message is only formatted when INFO is enabled.
    logging.info("download_file: %s, %s, %s, %s", remote_path, local_dir,
                 file_name, force_download)
    if file_name:
        all_local_files = fsclient.list_folder(os.path.join(
            local_dir, file_name + ".*"),
                                               wild=True,
                                               remove_folder_name=True)
        if all_local_files:
            local_file_path = os.path.join(local_dir, all_local_files[0])

    if not local_file_path:
        remote_file_info = get_remote_file_info(remote_path)
        if not remote_file_info:
            raise Exception("Remote path does not exist or unaccessible: %s" %
                            (remote_path))

        if file_name:
            local_file_path = os.path.join(
                local_dir, file_name + remote_file_info.get('file_ext'))
        else:
            local_file_path = os.path.join(
                local_dir,
                remote_file_info.get('file_name') +
                remote_file_info.get('file_ext'))

    if fsclient.isFileExists(local_file_path):
        # NOTE(review): etag/size change detection appears unimplemented —
        # both flags are always False, so only force_download can trigger
        # re-fetching an existing file. Confirm this is intentional.
        etag_changed = False
        file_size_changed = False

        if force_download:
            logging.info("Force download file again.")

        if force_download or etag_changed or file_size_changed:
            fsclient.remove_file(local_file_path)
        else:
            need_download = False

    if need_download:
        logging.info("Download to local file path: %s", local_file_path)
        fsclient.download_file(remote_path, local_file_path)

    return local_file_path
Example #5
0
    def _get_prediction_files(model_path, prediction_group_id=None):
        """Return prediction result files for a model, newest first.

        Looks under ``<model_path>/predictions`` for
        ``*_results.feather.zstd`` files, optionally restricted to a
        single prediction group.

        Raises:
            Exception: When no matching prediction files exist.
        """
        # Build the glob once: group-specific when a group id is given.
        if prediction_group_id:
            pattern = f"predictions/*_{prediction_group_id}_results.feather.zstd"
        else:
            pattern = "predictions/*_results.feather.zstd"
        predictions_path = os.path.join(model_path, pattern)

        result_files = fsclient.list_folder(predictions_path,
                                            wild=True,
                                            remove_folder_name=False,
                                            meta_info=True)
        result_files.sort(key=lambda item: item['last_modified'], reverse=True)

        if not result_files:
            raise Exception(
                'there is no prediction results for this model in ' +
                predictions_path)

        return result_files