def build_review_data(self, data_path=None, output=None):
    """Merge stored actuals into the training data and write a review dataset.

    Args:
        data_path: source training data; falls back to self.options['data_path'].
        output: destination path; generated next to data_path when omitted.

    Returns:
        The path of the written review file.
    """
    if not data_path:
        data_path = self.options['data_path']

    review_frame = DataFrame.create_dataframe(data_path)
    actual_files = fsclient.list_folder(
        os.path.join(self.model_path, "predictions/*_actuals.feather.zstd"),
        wild=True, remove_folder_name=False, meta_info=True)
    # Process the most recently modified actuals first.
    actual_files.sort(key=lambda item: item['last_modified'], reverse=True)

    for _path, actuals_frame in DataFrame.load_from_files(actual_files):
        if actuals_frame.df.empty:
            continue
        actuals_frame.drop(['prediction_id', 'prediction_group_id'])
        # Keep only the columns the training data already has.
        review_frame.df = pd.concat(
            [review_frame.df, actuals_frame.df[review_frame.columns]],
            ignore_index=True)

    review_frame.drop_duplicates()

    if not output:
        output = os.path.splitext(data_path)[0] + "_review_%s.feather.zstd" % (get_uid())

    review_frame.saveToFile(output)
    return output
def count_actuals_by_prediction_id(self):
    """Count actual records per prediction group across all stored actuals files.

    Duplicate prediction_id rows inside a group are collapsed so each unique
    prediction_id contributes exactly one count.

    Returns:
        dict: prediction_group_id -> number of unique prediction_ids with actuals.
    """
    res = {}
    features = ['prediction_group_id', 'prediction_id', self.target_feature]
    counter = ProbabilisticCounter()
    all_files = fsclient.list_folder(
        os.path.join(self.model_path, "predictions/*_actuals.feather.zstd"),
        wild=True, remove_folder_name=False, meta_info=False)

    for (file, df) in DataFrame.load_from_files(all_files, features):
        ModelReview._remove_duplicates_by(df, 'prediction_id', counter)
        agg = df.df.groupby(['prediction_group_id', 'prediction_id']).count()
        # Force the target column to 1 so the second count() tallies unique
        # prediction_id's per group rather than raw actual rows.
        agg[self.target_feature] = 1
        agg = agg.groupby('prediction_group_id').count()

        for prediction_group_id, row in agg.iterrows():
            # .iloc[0] — positional Series indexing via row[0] is deprecated
            # in pandas 2.x and ambiguous with label-based access.
            count = row.iloc[0]
            res[prediction_group_id] = res.get(prediction_group_id, 0) + count

    return res
def _prediction_files_by_day(model_path, date_from, date_to, path_suffix):
    """Yield (day, files) pairs of prediction files matching path_suffix.

    When date_from is falsy, a single ("today", files) pair covering every
    matching file is yielded instead of per-day results.
    """
    if not date_from:
        # No date range: glob across all prediction files at once.
        path = os.path.join(model_path, "predictions/" + "*" + path_suffix)
        files = fsclient.list_folder(
            path, wild=True, remove_folder_name=False, meta_info=False)
        yield ("today", files)
        return

    day = convert_to_date(date_from)
    last_day = convert_to_date(date_to)
    step = datetime.timedelta(days=1)
    while day <= last_day:
        path = os.path.join(model_path, "predictions/" + str(day) + path_suffix)
        files = fsclient.list_folder(
            path, wild=True, remove_folder_name=False, meta_info=False)
        yield (day, files)
        day += step
def download_file(remote_path, local_dir, file_name, force_download=False):
    """Download remote_path into local_dir, reusing an existing local copy.

    Args:
        remote_path: source location understood by fsclient/get_remote_file_info.
        local_dir: directory for the downloaded file.
        file_name: desired base name (extension taken from the remote file);
            when falsy, the remote file's own name is used.
        force_download: re-download even if a local copy already exists.

    Returns:
        The local path of the (possibly pre-existing) file.

    Raises:
        Exception: when the remote path does not exist or is inaccessible.
    """
    local_file_path = ""
    need_download = True  # renamed: previously shadowed the function name
    remote_file_info = {}

    # Lazy %-args: formatting only happens if INFO logging is enabled.
    logging.info("download_file: %s, %s, %s, %s",
                 remote_path, local_dir, file_name, force_download)

    # Reuse an already-downloaded file matching the requested name (any extension).
    if file_name:
        all_local_files = fsclient.list_folder(
            os.path.join(local_dir, file_name + ".*"),
            wild=True, remove_folder_name=True)
        if all_local_files:
            local_file_path = os.path.join(local_dir, all_local_files[0])

    if not local_file_path:
        remote_file_info = get_remote_file_info(remote_path)
        if not remote_file_info:
            raise Exception("Remote path does not exist or unaccessible: %s" % (remote_path))
        if file_name:
            local_file_path = os.path.join(
                local_dir, file_name + remote_file_info.get('file_ext'))
        else:
            local_file_path = os.path.join(
                local_dir,
                remote_file_info.get('file_name') + remote_file_info.get('file_ext'))

    if fsclient.isFileExists(local_file_path):
        # TODO: etag/size change detection is not implemented; these
        # placeholders keep the re-download decision in one expression.
        etag_changed = False
        file_size_changed = False

        if force_download:
            logging.info("Force download file again.")

        if force_download or etag_changed or file_size_changed:
            fsclient.remove_file(local_file_path)
        else:
            need_download = False

    if need_download:
        logging.info("Download to local file path: %s", local_file_path)
        fsclient.download_file(remote_path, local_file_path)

    return local_file_path
def _get_prediction_files(model_path, prediction_group_id=None):
    """Return metadata entries for prediction result files, newest first.

    Args:
        model_path: model folder containing a predictions/ subfolder.
        prediction_group_id: when given, restrict matches to that group.

    Returns:
        List of file-info dicts sorted by 'last_modified' descending.

    Raises:
        Exception: when no prediction result files exist at the path.
    """
    predictions_path = os.path.join(model_path, "predictions/*_results.feather.zstd")
    if prediction_group_id:
        predictions_path = os.path.join(
            model_path, f"predictions/*_{prediction_group_id}_results.feather.zstd")

    files = fsclient.list_folder(predictions_path, wild=True,
                                 remove_folder_name=False, meta_info=True)
    files.sort(key=lambda f: f['last_modified'], reverse=True)

    # Idiomatic emptiness check instead of len(files) == 0.
    if not files:
        raise Exception(
            'there is no prediction results for this model in ' + predictions_path)

    return files