def produce_report(self):
    """Produce a report of the batch jobs."""
    s3_prefix = 'reading_results/%s/logs/%s/' % (self.basename,
                                                 self._job_queue)
    logger.info("Producing batch report for %s, from prefix %s."
                % (self.basename, s3_prefix))
    s3 = boto3.client('s3')
    file_tree = get_s3_file_tree(s3, bucket_name, s3_prefix)
    logger.info("Found %d relevant files." % len(file_tree))
    stat_files = {
        'git_info.txt': (self._handle_git_info, self._report_git_info),
        'timing.txt': (self._handle_timing, self._report_timing),
        'raw_tuples.pkl': (None, None),
        'hist_data.pkl': (self._handle_hist_data, self._report_hist_data),
        'sum_data.pkl': (self._handle_sum_data, self._report_sum_data)
    }
    stat_aggs = {}
    for stat_file, (handle_stats, report_stats) in stat_files.items():
        logger.info("Aggregating %s..." % stat_file)
        # Prep the data storage.
        my_agg = {}

        # Get a list of the relevant files (one per job).
        file_paths = file_tree.get_paths(stat_file)
        logger.info("Found %d files for %s." % (len(file_paths), stat_file))

        # Aggregate the data from all the jobs for each file type.
        for sub_path, file_entry in file_paths:
            s3_key = file_entry['key']
            ref = sub_path[0]
            file = s3.get_object(Bucket=bucket_name, Key=s3_key)
            file_bytes = file['Body'].read()
            if handle_stats is not None:
                handle_stats(ref, my_agg, file_bytes)

        if report_stats is not None and len(my_agg):
            report_stats(my_agg)

        stat_aggs[stat_file] = my_agg

    for end_type, jobs in self.run_record.items():
        self.reporter.add_text('Jobs %s: %d' % (end_type, len(jobs)),
                               section='Totals')

    s3_prefix = 'reading_results/%s/' % self.basename
    fname = self.reporter.make_report()
    with open(fname, 'rb') as f:
        s3.put_object(Bucket=bucket_name, Key=s3_prefix + fname,
                      Body=f.read())
    s3.put_object(Bucket=bucket_name,
                  Key=s3_prefix + 'stat_aggregates_%s.pkl' % self.time_tag,
                  Body=pickle.dumps(stat_aggs))

    return file_tree, stat_aggs
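
# Illustrative usage sketch (not part of the original module): `submitter` is
# assumed to be an instance of the class that defines produce_report above,
# used after its batch jobs have finished.
def _example_produce_report(submitter):
    """Produce the batch report and log a summary of the aggregates."""
    file_tree, stat_aggs = submitter.produce_report()
    for stat_file, agg in stat_aggs.items():
        logger.info("Aggregated %d entries from %s." % (len(agg), stat_file))
    return stat_aggs
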
def sort_s3_files_by_last_mod(bucket, prefix, time_delta=None, extension=None,
                              unsigned=True, reverse=False, w_dt=False):
    """Return a list of s3 object keys sorted by their LastModified date on S3

    Parameters
    ----------
    bucket : str
        s3 bucket to look for keys in
    prefix : str
        The prefix to use for the s3 keys
    time_delta : Optional[datetime.timedelta]
        If used, specifies how far back to look for files on s3.
        Default: None.
    extension : Optional[str]
        If used, limit keys to those with the matching file extension.
        Default: None.
    unsigned : bool
        If True, use an unsigned s3 client. Default: True.
    reverse : bool
        Reverse the sort order of the returned s3 files. Default: False.
    w_dt : bool
        If True, return each key together with its datetime as a tuple
        (key, datetime.datetime). Default: False.

    Returns
    -------
    list
        A list of s3 keys. If w_dt is True, each item is a tuple of
        (key, datetime.datetime) of the LastModified date.
    """
    if time_delta is None:
        time_delta = timedelta()  # zero timedelta
    s3 = get_s3_client(unsigned)
    n_hours_ago = datetime.utcnow() - time_delta
    file_tree = get_s3_file_tree(s3, bucket, prefix, date_cutoff=n_hours_ago,
                                 with_dt=True)
    key_list = sorted(list(file_tree.get_leaves()), key=lambda t: t[1],
                      reverse=reverse)
    if extension:
        return [t if w_dt else t[0] for t in key_list
                if t[0].endswith(extension)]
    else:
        return key_list if w_dt else [t[0] for t in key_list]
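
# Illustrative usage sketch (not part of the original module): list pickles
# modified in the last week, newest first. The bucket and prefix names are
# hypothetical placeholders.
def _example_recent_pickles():
    """Return the key of the newest pickle under a prefix, or None."""
    keys = sort_s3_files_by_last_mod(bucket='my-bucket',
                                     prefix='reading_results/',
                                     time_delta=timedelta(days=7),
                                     extension='.pkl', reverse=True)
    return keys[0] if keys else None
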
def get_latest_pa_stmt_dump():
    s3_cli = get_s3_client(False)
    # Get file key
    dump_name = 'full_pa_stmts.pkl'
    file_tree = get_s3_file_tree(s3=s3_cli, bucket=DUMPS_BUCKET,
                                 prefix=DUMPS_PREFIX, with_dt=True)
    # Get all keys for dump_name
    keys = [key for key in file_tree.gets('key')
            if key[0].endswith(dump_name)]
    keys.sort(key=itemgetter(1))  # Sorts ascending by datetime
    return load_pickle_from_s3(s3_cli, keys[-1][0], DUMPS_BUCKET)
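
# Illustrative usage sketch (not part of the original module): load the most
# recent preassembled statement dump and count its contents, assuming the
# pickle holds a list of statements.
def _example_count_latest_pa_stmts():
    """Load the latest dump and log how many statements it contains."""
    stmts = get_latest_pa_stmt_dump()
    logger.info('Loaded %d statements from the latest dump.' % len(stmts))
    return stmts
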
def get_latest_sif_s3(get_mesh_ids: bool = False) \
        -> Union[Tuple[Any, str], Tuple[Tuple[Any, str], Tuple[Any, str]]]:
    """Load the latest sif dump pickle from S3.

    If get_mesh_ids is True, also load the latest statement hash to MeSH ID
    mapping. Each loaded object is returned together with the date string
    parsed from its S3 key.
    """
    necc_files = [mngr.name for mngr in dumpers]
    if get_mesh_ids:
        necc_files.append(StatementHashMeshId.name)
    s3 = get_s3_client(unsigned=False)
    tree = get_s3_file_tree(s3, bucket=DUMPS_BUCKET, prefix=DUMPS_PREFIX,
                            with_dt=True)
    # Find all pickles and jsons
    keys = [key for key in tree.gets('key')
            if key[0].endswith(('.pkl', '.json'))]
    # Sort newest first
    keys.sort(key=lambda t: t[1], reverse=True)
    # Get keys of those pickles
    keys_in_latest_dir = \
        [k[0] for k in keys if any(nfl in k[0] for nfl in necc_files)]
    # Map each resource name to its key
    necc_keys = {}
    for n in necc_files:
        for k in keys_in_latest_dir:
            # Check if the resource name appears in the key
            if n in k:
                # Save and continue to next file in necc_files
                necc_keys[n] = k
                break
    logger.info(f'Latest files: {", ".join(necc_keys.values())}')
    df = load_pickle_from_s3(s3, key=necc_keys[Sif.name],
                             bucket=DUMPS_BUCKET)
    sif_date = _get_date_from_s3_key(necc_keys[Sif.name])
    if get_mesh_ids:
        mid = load_pickle_from_s3(s3,
                                  key=necc_keys[StatementHashMeshId.name],
                                  bucket=DUMPS_BUCKET)
        meshids_date = _get_date_from_s3_key(
            necc_keys[StatementHashMeshId.name])
        return (df, sif_date), (mid, meshids_date)

    return df, sif_date
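
# Illustrative usage sketch (not part of the original module): fetch the
# latest sif dump with its date stamp, and optionally the statement
# hash to MeSH ID mapping as well.
def _example_load_latest_sif(with_mesh=False):
    """Return the latest sif dump, optionally with the MeSH ID mapping."""
    if with_mesh:
        (sif_df, sif_date), (mesh_ids, mesh_date) = \
            get_latest_sif_s3(get_mesh_ids=True)
        return sif_df, sif_date, mesh_ids, mesh_date
    sif_df, sif_date = get_latest_sif_s3()
    return sif_df, sif_date
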
def get_logs_from_s3(folder=None, cached=True, past_days=None):
    """Download logs from S3 and save into a local folder

    Parameters
    ----------
    folder : str
        The directory in which to put the processed reports
    cached : bool
        Look for already existing folders and skip those that exist
    past_days : int|datetime.datetime
        Either an integer or an instance of a datetime.datetime object
        specifying the number of days into the past to download logs for.
        If nothing is specified (default), all logs are downloaded.
        Default: None.

    Returns
    -------
    dir_set : set
        A set containing the dir paths of all the requested logs
    """
    s3 = get_s3_client(unsigned=False)
    if past_days:
        days_ago = past_days if isinstance(past_days, datetime) else \
            ((datetime.utcnow() - timedelta(days=past_days)).replace(
                tzinfo=timezone.utc) if isinstance(past_days, int) else None)
    else:
        days_ago = None
    tree = get_s3_file_tree(s3, 'cwc-hms', 'bob_ec2_logs', days_ago)
    keys = tree.gets('key')
    # Here we only get the tar.gz files which contain the logs for the
    # facilitator + the json file (if present) of the user data
    logger.info('Total number of objects: %d ' % len(keys))
    logger.info('Total number of images found: %d' %
                len([k for k in keys if 'image' in k]))
    keys = [key for key in keys if key.startswith('bob_ec2_logs/') and
            key.endswith(('.tar.gz', '.json', '.log'))]
    logger.info('Number of archives: %d' % len(keys))
    fname_patt = re.compile(
        r'([\w:-]+?)_(\w+?)_(\w+?_\w+?)_(.*)\.(tar\.gz|json|log)'
    )
    dir_set = set()
    for key in tqdm.tqdm(keys):
        fname = os.path.basename(key)
        m = fname_patt.match(fname)
        if m is None:
            logger.warning("File name %s failed to match %s. Skipping..."
                           % (fname, fname_patt))
            continue
        image_id, cont_hash, cont_name, resource_name, suffix = m.groups()
        head_dir_path = '%s_%s_%s' % (image_id.replace(':', '-'),
                                      cont_name, cont_hash)
        dir_set.add(head_dir_path)
        if folder:
            head_dir_path = os.path.join(folder, head_dir_path)
        if not os.path.exists(head_dir_path):
            os.makedirs(head_dir_path, exist_ok=True)
        if resource_name == 'bioagent_images':
            outpath = head_dir_path
        else:
            outpath = os.path.join(head_dir_path, 'log.txt')
        if cached and os.path.exists(outpath) and \
                not key.endswith(('.json', '.log')):
            continue
        tgz_file_name = key.split('/')[-1]
        tgz_file = os.path.join(head_dir_path, tgz_file_name)
        res = s3.get_object(Bucket='cwc-hms', Key=key)
        # byte_stream = BytesIO(res['Body'].read())
        byte_stream = res['Body'].read()
        with open(tgz_file, 'wb') as tf:
            tf.write(byte_stream)
        # Re-open file
        if tgz_file.endswith(('.json', '.log')):
            continue
        with open(tgz_file, 'rb') as file_byte_stream:
            with tarfile.open(None, 'r', fileobj=file_byte_stream) as tarf:
                if resource_name == 'bioagent_images':
                    tarf.extractall(outpath)
                else:
                    outpaths = tarf.getnames()
                    facls = [n for n in outpaths
                             if n.endswith('facilitator.log')]
                    if not facls:
                        logger.info('No facilitator.log found for %s' % key)
                        continue
                    facl = facls[0]
                    efo = tarf.extractfile(facl)
                    log_txt = efo.read().decode('utf-8')
                    with open(outpath, 'w') as fh:
                        fh.write(log_txt)
    return dir_set
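
# Illustrative usage sketch (not part of the original module): download the
# last two weeks of logs into a local directory. The folder name below is a
# hypothetical placeholder.
def _example_fetch_recent_logs():
    """Download recent logs and log how many directories were created."""
    log_dirs = get_logs_from_s3(folder='bob_logs', cached=True, past_days=14)
    logger.info('Downloaded logs into %d directories.' % len(log_dirs))
    return log_dirs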