Example #1
import os

# Assumes a project-level ``utils`` helper module plus LOG_FILENAME and
# LOG_FILE_TIMESTAMP constants defined elsewhere in the source module.
def fetch_web_logs(dbconfig, env, outdir, begin, stop):
    """
    Connect to weblog storage location and move weblogs locally

    :param dbconfig: database connection info (host, port, username, password)
    :param env: dev/tst/ops (to get hostname of the external download servers)
    :param outdir: location to save log files
    :param begin: timestamp to begin searching the logs
    :param stop: timestamp to stop searching the logs
    :return: None
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    dmzinfo = utils.query_connection_info(dbconfig, env)
    for log_loc in dmzinfo['log_locs']:
        host, remote_dir = log_loc.split(':')
        client = utils.RemoteConnection(host,
                                        user=dmzinfo['username'],
                                        password=dmzinfo['password'])
        files = client.list_remote_files(remote_dir=remote_dir,
                                         prefix=LOG_FILENAME)
        files = utils.subset_by_date(files, begin, stop, LOG_FILE_TIMESTAMP)
        for remote_path in files:
            filename = ("{host}_{fname}".format(
                host=host, fname=os.path.basename(remote_path)))
            local_path = os.path.join(outdir, filename)
            if not os.path.exists(local_path):
                client.download_remote_file(remote_path=remote_path,
                                            local_path=local_path)
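A hypothetical call site for reference; the dbconfig layout, paths, and dates are assumptions inferred from the docstring, not taken from the original source:

from datetime import datetime

dbconfig = {'host': 'localhost', 'port': 5432,  # assumed key layout
            'username': 'reporter', 'password': 'secret'}
fetch_web_logs(dbconfig, env='dev', outdir='/tmp/weblogs',
               begin=datetime(2019, 1, 1), stop=datetime(2019, 3, 31))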
Example #2
import glob
import gzip

# Assumes module-level helpers (utils, filter_log_line, get_sensor_name),
# the LOG_FILE_TIMESTAMP constant, and the ``sensors`` and
# ``valid_orderids`` collections referenced in the body below.
def parse_dls(log_glob, start_date, end_date, resource_regex):
    """
    Count the total tarballs downloaded from /orders/ and their combined size

    :param log_glob: Glob for Log Format file path (e.g. '/path/to/logs*')
    :type log_glob: str
    :param start_date: Compares >= timestamp in log
    :type start_date: datetime.date
    :param end_date: Compares <= timestamp in log
    :type end_date: datetime.date
    :param resource_regex: regex for matching resources (accepted but not
        used in this snippet)
    :type resource_regex: str
    :return: Tuple of (totals dictionary, list of order paths)
    """
    infodict = {'tot_dl': 0, 'tot_vol': 0.0}
    bytes_in_a_gb = 1073741824.0  # 2**30 bytes, i.e. one GiB

    files = glob.glob(log_glob)
    if len(files) < 1:
        raise IOError('Could not find %s' % log_glob)
    files = utils.subset_by_date(files, start_date, end_date,
                                 LOG_FILE_TIMESTAMP)
    if len(files) < 1:
        raise RuntimeError('No files found in date range: %s' % log_glob)

    order_paths = set()
    for log_file in files:
        print('* Parse: {}'.format(log_file))
        # Note: gzip.open defaults to binary mode, so under Python 3 each
        # line is bytes; filter_log_line is assumed to handle that.
        with gzip.open(log_file) as log:
            for line in log:
                gr = filter_log_line(line, start_date, end_date)
                if gr:
                    if get_sensor_name(gr['resource']) not in sensors:
                        # Difficult to say if statistics should be counted...
                        # if not gr['resource'].endswith('statistics.tar.gz'):
                        continue
                    rparts = gr['resource'].split('/')
                    if len(rparts) != 4:
                        raise ValueError('Unexpected directory structure: %s' %
                                         rparts)
                    elif rparts[1] not in valid_orderids:
                        continue
                    infodict['tot_vol'] += int(gr['size'])
                    infodict['tot_dl'] += 1
                    order_paths.add(gr['resource'])

    # Bytes to GB
    infodict['tot_vol'] /= bytes_in_a_gb

    return infodict, list(order_paths)
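For reference, a minimal hypothetical invocation; the log glob, dates, and regex value are illustrative assumptions (note that the snippet above never actually uses resource_regex):

from datetime import date

stats, orders = parse_dls('/var/log/httpd/access_log*.gz',  # assumed glob
                          date(2019, 1, 1), date(2019, 3, 31),
                          resource_regex=r'^/orders/')
print('{tot_dl} downloads, {tot_vol:.2f} GB'.format(**stats))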
Example #3
import glob
import gzip

# Assumes module-level helpers (utils, filter_log_line, get_sensor_name)
# and the LOG_FILE_TIMESTAMP constant.
def calc_dlinfo(log_glob, start_date, end_date, sensors):
    """
    Count the total tarballs downloaded from /orders/ and their combined size

    :param log_glob: Glob for Log Format file path (e.g. '/path/to/logs*')
    :type log_glob: str
    :param start_date: Compares >= timestamp in log
    :type start_date: datetime.date
    :param end_date: Compares <= timestamp in log
    :type end_date: datetime.date
    :param sensors: which sensors to process (['tm4','etm7',...])
    :type sensors: tuple
    :return: Tuple of (totals dictionary, list of order paths)
    """
    infodict = {'tot_dl': 0,
                'tot_vol': 0.0}
    bytes_in_a_gb = 1073741824.0  # 2**30 bytes, i.e. one GiB

    files = glob.glob(log_glob)
    if len(files) < 1:
        raise IOError('Could not find %s' % log_glob)
    files = utils.subset_by_date(files, start_date, end_date, LOG_FILE_TIMESTAMP)
    if len(files) < 1:
        raise RuntimeError('No files found in date range: %s' % log_glob)

    order_paths = set()
    for log_file in files:
        print('* Parse: {}'.format(log_file))
        # Note: gzip.open defaults to binary mode, so under Python 3 each
        # line is bytes; filter_log_line is assumed to handle that.
        with gzip.open(log_file) as log:
            for line in log:
                gr = filter_log_line(line, start_date, end_date)
                if gr:
                    if get_sensor_name(gr['resource']) not in sensors:
                        # Difficult to say if statistics should be counted...
                        # if not gr['resource'].endswith('statistics.tar.gz'):
                        continue
                    infodict['tot_vol'] += int(gr['size'])
                    infodict['tot_dl'] += 1
                    order_paths.add(gr['resource'])

    # Bytes to GB
    infodict['tot_vol'] /= bytes_in_a_gb

    return infodict, list(order_paths)
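And a matching hypothetical call for this variant; the glob, dates, and sensor names are illustrative, with the sensor tuple format following the docstring:

from datetime import date

stats, orders = calc_dlinfo('/var/log/httpd/access_log*.gz',  # assumed glob
                            date(2019, 1, 1), date(2019, 3, 31),
                            sensors=('tm4', 'etm7'))
print('{tot_dl} downloads, {tot_vol:.2f} GB'.format(**stats))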