def fetch_web_logs(dbconfig, env, outdir, begin, stop):
    """ Connect to weblog storage location and move weblogs locally

    :param dbconfig: database connection info (host, port, username, password)
    :param env: dev/tst/ops (to get hostname of the external download servers)
    :param outdir: location to save log files
    :param begin: timestamp to begin searching the logs
    :param stop: timestamp to stop searching the logs
    :return: None
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    dmzinfo = utils.query_connection_info(dbconfig, env)

    for log_loc in dmzinfo['log_locs']:
        host, remote_dir = log_loc.split(':')
        client = utils.RemoteConnection(host, user=dmzinfo['username'],
                                        password=dmzinfo['password'])
        files = client.list_remote_files(remote_dir=remote_dir,
                                         prefix=LOG_FILENAME)
        files = utils.subset_by_date(files, begin, stop, LOG_FILE_TIMESTAMP)
        for remote_path in files:
            filename = ("{host}_{fname}"
                        .format(host=host,
                                fname=os.path.basename(remote_path)))
            local_path = os.path.join(outdir, filename)
            if not os.path.exists(local_path):
                client.download_remote_file(remote_path=remote_path,
                                            local_path=local_path)
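# Example usage, a sketch only: the config path, environment name, output
# directory, and dates below are illustrative assumptions, and begin/stop are
# assumed to be datetime.date values to match how subset_by_date is called
# with datetime.date elsewhere in this module.
#
#   fetch_web_logs(dbconfig='/path/to/db.conf', env='ops',
#                  outdir='/tmp/weblogs',
#                  begin=datetime.date(2017, 1, 1),
#                  stop=datetime.date(2017, 1, 31))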
def parse_dls(log_glob, start_date, end_date, sensors, valid_orderids):
    """ Count the total tarballs downloaded from /orders/ and their combined size

    :param log_glob: Glob for Log Format file path (e.g. '/path/to/logs*')
    :type log_glob: str
    :param start_date: Compares >= timestamp in log
    :type start_date: datetime.date
    :param end_date: Compares <= timestamp in log
    :type end_date: datetime.date
    :param sensors: which sensors to process (['tm4','etm7',...])
    :type sensors: tuple
    :param valid_orderids: order ids whose downloads should be counted
    :type valid_orderids: set
    :return: Tuple of (download totals dictionary, list of downloaded resources)
    """
    infodict = {'tot_dl': 0, 'tot_vol': 0.0}
    bytes_in_a_gb = 1073741824.0

    files = glob.glob(log_glob)
    if len(files) < 1:
        raise IOError('Could not find %s' % log_glob)

    files = utils.subset_by_date(files, start_date, end_date,
                                 LOG_FILE_TIMESTAMP)
    if len(files) < 1:
        raise RuntimeError('No files found in date range: %s' % log_glob)

    order_paths = set()
    for log_file in files:
        print('* Parse: {}'.format(log_file))
        with gzip.open(log_file) as log:
            for line in log:
                gr = filter_log_line(line, start_date, end_date)
                if gr:
                    if get_sensor_name(gr['resource']) not in sensors:
                        # Difficult to say if statistics should be counted...
                        # if not gr['resource'].endswith('statistics.tar.gz'):
                        continue
                    # Expected resource layout: /orders/<orderid>/<filename>
                    rparts = gr['resource'].split('/')
                    if len(rparts) != 4:
                        raise ValueError('Unexpected directory structure: %s'
                                         % rparts)
                    elif rparts[2] not in valid_orderids:
                        continue
                    infodict['tot_vol'] += int(gr['size'])
                    infodict['tot_dl'] += 1
                    order_paths.add(gr['resource'])

    # Bytes to GB
    infodict['tot_vol'] /= bytes_in_a_gb
    return infodict, list(order_paths)
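# Example usage, a sketch only: the log glob, dates, and order id are
# illustrative assumptions (the '<orderid>' placeholder stands in for a real
# order id string); sensor names follow the docstring example above.
#
#   totals, paths = parse_dls('/path/to/logs/access_log*.gz',
#                             datetime.date(2017, 1, 1),
#                             datetime.date(2017, 1, 31),
#                             sensors=('tm4', 'etm7'),
#                             valid_orderids={'<orderid>'})
#   print(totals['tot_dl'], totals['tot_vol'])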
def calc_dlinfo(log_glob, start_date, end_date, sensors):
    """ Count the total tarballs downloaded from /orders/ and their combined size

    :param log_glob: Glob for Log Format file path (e.g. '/path/to/logs*')
    :type log_glob: str
    :param start_date: Compares >= timestamp in log
    :type start_date: datetime.date
    :param end_date: Compares <= timestamp in log
    :type end_date: datetime.date
    :param sensors: which sensors to process (['tm4','etm7',...])
    :type sensors: tuple
    :return: Tuple of (download totals dictionary, list of downloaded resources)
    """
    infodict = {'tot_dl': 0, 'tot_vol': 0.0}
    bytes_in_a_gb = 1073741824.0

    files = glob.glob(log_glob)
    if len(files) < 1:
        raise IOError('Could not find %s' % log_glob)

    files = utils.subset_by_date(files, start_date, end_date,
                                 LOG_FILE_TIMESTAMP)
    if len(files) < 1:
        raise RuntimeError('No files found in date range: %s' % log_glob)

    order_paths = set()
    for log_file in files:
        print('* Parse: {}'.format(log_file))
        with gzip.open(log_file) as log:
            for line in log:
                gr = filter_log_line(line, start_date, end_date)
                if gr:
                    if get_sensor_name(gr['resource']) not in sensors:
                        # Difficult to say if statistics should be counted...
                        # if not gr['resource'].endswith('statistics.tar.gz'):
                        continue
                    infodict['tot_vol'] += int(gr['size'])
                    infodict['tot_dl'] += 1
                    order_paths.add(gr['resource'])

    # Bytes to GB
    infodict['tot_vol'] /= bytes_in_a_gb
    return infodict, list(order_paths)
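# Example usage, a sketch only: the log glob and dates are illustrative
# assumptions. Unlike parse_dls above, calc_dlinfo counts every matching
# sensor download without filtering by order id.
#
#   info, downloads = calc_dlinfo('/path/to/logs/access_log*.gz',
#                                 datetime.date(2017, 1, 1),
#                                 datetime.date(2017, 1, 31),
#                                 sensors=('tm4', 'etm7'))
#   print('Total Downloads: {tot_dl}\n'
#         'Total Volume (GB): {tot_vol:.2f}'.format(**info))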