def run_taro_experiments():
    experiments = get_taro_experiments()
    completed_experiment_procs = []
    logging.info('Going to run {} experiments.'.format(len(experiments)))

    running_experiment = 1

    for experiment in experiments.values():
        try:
            print('Running experiment {}/{}'.format(running_experiment,
                                                    len(experiments)))
            cctestbed.run_local_command(
                '/opt/bess/bessctl/bessctl daemon stop')
            proc = experiment.run()
            completed_experiment_procs.append(proc)
            running_experiment += 1
        except Exception as e:
            print('ERROR RUNNING EXPERIMENT: {}'.format(e))

    for proc in completed_experiment_procs:
        logging.info('Waiting for subprocess to finish PID={}'.format(
            proc.pid))
        proc.wait()
        if proc.returncode != 0:
            logging.warning('Error running cmd PID={}'.format(proc.pid))
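The wait loop above relies on each experiment handing back a process handle; a minimal sketch of the same wait/returncode pattern, assuming the handles are plain subprocess.Popen objects (an assumption here):

import subprocess

proc = subprocess.Popen(['sleep', '1'])   # stand-in for experiment.run()
proc.wait()                               # block until the child exits
if proc.returncode != 0:                  # non-zero exit means failure
    print('command failed with code {}'.format(proc.returncode))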
Example #2
def start_bess_for_local_video(exp, duration):
    with ExitStack() as stack:
        exp._run_tcpdump('server', stack)
        exp._run_tcpdump('server', stack, capture_http=True)
        cctestbed.stop_bess()
        stack.enter_context(exp._run_bess(
            ping_source='client',
            skip_ping=False,
            bess_config_name='active-middlebox-pmd-fairness'))
        # give bess time to start
        time.sleep(5)
        exp._show_bess_pipeline()
        stack.enter_context(exp._run_bess_monitor())
        util.start_apache_server(exp.flows[0])
        video_flow = start_single_local_video_flow(exp.flows[0], exp, stack)
        logging.info('Waiting for flow to finish')
        # wait for flow to finish
        video_flow._wait()
        logging.info('Video flow finished')
        # add a time buffer before finishing up the experiment
        time.sleep(5)
        exp._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))
        util.stop_local_server_and_cleanup(exp)
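These start_bess_* helpers lean on contextlib.ExitStack so that tcpdump, BESS, and the monitors are torn down in reverse order even if a later step raises; a minimal illustration of that unwinding behavior:

from contextlib import ExitStack, contextmanager

@contextmanager
def step(name):
    print('start', name)
    yield
    print('stop', name)

with ExitStack() as stack:
    stack.enter_context(step('bess'))
    stack.enter_context(step('monitor'))
# prints: start bess, start monitor, stop monitor, stop bess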
Example #3
def get_ping_rtt(instance_ip):
    #cmd = "nping --icmp -v-1 -H -c 5 {} | grep -oP 'Avg rtt:\s+\K.*(?=ms)'".format(instance_ip)
    # tail -1 grabs ping's summary line,
    # e.g. "rtt min/avg/max/mdev = 10.1/11.2/12.8/0.5 ms"
    cmd = 'ping -c 5 {} | tail -1'.format(instance_ip)
    line = cctestbed.run_local_command(cmd, shell=True)
    print("get ping rtt: ", line)
    rtt = float(line.split('=')[-1].split('/')[1])
    return rtt
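A worked example of that parse on a typical Linux ping footer (the sample values are illustrative):

line = 'rtt min/avg/max/mdev = 10.123/11.456/12.789/0.512 ms'
rtt = float(line.split('=')[-1].split('/')[1])
print(rtt)   # 11.456, the avg field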
Example #4
def get_website_ip(url):
    url_parts = list(urlsplit(url.strip()))
    hostname = url_parts[1]
    ip_addrs = cctestbed.run_local_command(
        "nslookup {} | awk '/^Address: / {{ print $2 ; exit }}'".format(hostname), shell=True)
    ip_addr = ip_addrs.split('\n')[0]
    if ip_addr.strip() == '':
        raise ValueError('Could not find IP addr for {}'.format(url))
    return ip_addr
def get_video_server_host(url):
    videoUrls = cctestbed.run_local_command(
        "youtube-dl --youtube-skip-dash-manifest -g {}".format(url),
        shell=True)
    # print(videoUrls)
    # youtube-dl prints the video URL on the first line, the audio URL on the second
    video_url = videoUrls.split('\n')[0]
    audio_url = videoUrls.split('\n')[1]
    print("Video URL: ", video_url)
    print("Audio URL: ", audio_url)
    url_parts = list(urlsplit(video_url.strip()))
    hostname = url_parts[1]
    print("video hostname {}", hostname)
    return hostname
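url_parts[1] is the netloc field of urlsplit, i.e. the hostname (plus port, if any); for example (the hostname below is illustrative):

from urllib.parse import urlsplit
parts = list(urlsplit('https://r4.example.googlevideo.com/videoplayback?x=1'))
print(parts[1])   # 'r4.example.googlevideo.com'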
def get_video_server_ip(hostname):
    video_ip_addrs = cctestbed.run_local_command(
        "nslookup {} | awk '/^Address: / {{ print $2 ; exit }}'".format(
            hostname),
        shell=True)

    print("Video IPs: ", video_ip_addrs)

    video_ip_addr = video_ip_addrs.split('\n')[0]
    print("ipaddress of video server {}", video_ip_addr)
    if video_ip_addr.strip() == '':
        raise ValueError('Could not find IP addr for {}'.format(video_url))
    return video_ip_addr
def start_bess_for_combinatory_flows(experiment, includeWebsite, duration):
    with ExitStack() as stack:

        util.prerequisite_for_combinatory_tests(experiment, stack,
                                                includeWebsite)

        ssh_client = util.get_ssh_client_for_server_node(experiment)
        with ssh_client as ssh_client:

            local_web_service_required, single_local_flow_details = util.isApacheNeeded(
                experiment)
            print("local_web_service_required & single_local_flow_details>",
                  local_web_service_required, single_local_flow_details)
            if local_web_service_required:
                print("Start Apache")
                util.start_apache_server(single_local_flow_details)

            # TODO: Measure RTT of third-party services again after the remaining delay is set

            #iperf
            flowImpl.start_iperf_flows(experiment, stack)
            #Web video
            contains_webvideo_flows = flowImpl.start_web_video_flows(
                experiment, stack)
            #Local video
            contains_localvideo_flows = flowImpl.start_local_video_flows(
                experiment, stack)
            #Local website
            contains_local_website_flows = flowImpl.start_local_website_flows(
                ssh_client, experiment, stack)
            #Website
            contains_website_flows = flowImpl.start_website_flows(
                ssh_client, experiment, stack)

            time.sleep(duration + 5)

        #Save website info onto a file
        if contains_webvideo_flows:
            clean_up_web_video(experiment, duration)

        if contains_webvideo_flows or contains_website_flows:
            util.write_webdata_to_log(experiment, duration)

        if contains_localvideo_flows or contains_local_website_flows:
            util.stop_local_server_and_cleanup(experiment)

        experiment._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))
Example #8
def start_bess_for_iperf(exp, duration):
    with ExitStack() as stack:
        exp._run_tcpdump('server', stack)
        cctestbed.stop_bess()
        stack.enter_context(exp._run_bess(
            ping_source='client',
            skip_ping=False,
            bess_config_name='active-middlebox-pmd-fairness'))
        # give bess time to start
        time.sleep(5)
        exp._show_bess_pipeline()
        stack.enter_context(exp._run_bess_monitor())
        start_iperf_flows(exp, stack)
        time.sleep(duration+5)
        exp._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))
Example #9
def start_bess_for_website(exp, duration, web_data):
    with ExitStack() as stack:
        print(web_data)
        stack.enter_context(util.add_dnat_rule(exp, web_data['url_ip']))
        stack.enter_context(util.add_route(exp, web_data['url_ip']))
        stack.enter_context(util.add_dns_rule(exp, web_data['website'], web_data['url_ip']))
        exp._run_tcpdump('server', stack)
        # run the flow
        # NOTE: a subprocess/Popen bug in Python 3.5
        # (https://bugs.python.org/issue27122) may require skip_ping=True,
        # though skip_ping is False here
        cctestbed.stop_bess()
        stack.enter_context(exp._run_bess(ping_source='server', skip_ping=False, bess_config_name='active-middlebox-pmd-fairness'))
        # give bess some time to start
        time.sleep(5)
        exp._show_bess_pipeline()
        stack.enter_context(exp._run_bess_monitor())
        stack.enter_context(exp._run_rtt_monitor())
        ssh_client = cctestbed.get_ssh_client(exp.server.ip_wan, exp.server.username, key_filename=exp.server.key_filename)
        
        with ssh_client as ssh_client:
            start_website_flows(ssh_client, exp, stack)
            # exit_status = stdout.channel.recv_exit_status()
            time.sleep(duration+5)
        # flow_end_time = time.time()
        logging.info('Flow ran for {} seconds'.format(duration+5))

        exp._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))

        logging.info('Dumping website data to log: {}'.format(exp.logs['website_log']))
        with open(exp.logs['website_log'], 'w') as f:
            website_info = {}
            website_info['website'] = web_data['website']
            website_info['url'] = web_data['url']
            website_info['website_rtt'] = web_data['website_rtt']
            website_info['experiment_rtt'] = web_data['experiment_rtt']
            website_info['delay'] = web_data['delay']
            website_info['url_ip'] = web_data['url_ip']
            website_info['flow_runtime'] = duration+5
            json.dump(website_info, f)
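The JSON written here can be read straight back during analysis; a minimal sketch, assuming the same log path:

import json

with open(exp.logs['website_log']) as f:
    website_info = json.load(f)
print(website_info['website'], website_info['flow_runtime'])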
Example #10
def run_experiment(website, url, btlbw=10, queue_size=128, rtt=35, force=False):
    experiment_name = '{}bw-{}rtt-{}q-{}'.format(btlbw, rtt, queue_size, website)
    if not force and is_completed_experiment(experiment_name):
        return (None, '')
    logging.info('Creating experiment for website: {}'.format(website))
    url_ip = get_website_ip(url)
    logging.info('Got website IP: {}'.format(url_ip))
    website_rtt = int(float(get_nping_rtt(url_ip)))
    logging.info('Got website RTT: {}'.format(website_rtt))

    if website_rtt >= rtt:
        logging.warning('Skipping experiment with website RTT {} >= {}'.format(
            website_rtt, rtt))
        return (-1, '')

    # copy the template so the shared dict is not mutated across runs
    client = HOST_CLIENT_TEMPLATE.copy()
    client['ip_wan'] = url_ip
    client = cctestbed.Host(**client)
    server = HOST_SERVER
    
    server_nat_ip = HOST_CLIENT.ip_wan #'128.104.222.182'  taro
    server_port = 5201
    client_port = 5555

    flow = {'ccalg': 'reno',
            'end_time': 60,
            'rtt': rtt - website_rtt,
            'start_time': 0}
    flows = [cctestbed.Flow(ccalg=flow['ccalg'], start_time=flow['start_time'],
                            end_time=flow['end_time'], rtt=flow['rtt'],
                            server_port=server_port, client_port=client_port,
                            client_log=None, server_log=None, kind='website',
                            client=client)]
    
    exp = cctestbed.Experiment(name=experiment_name,
                     btlbw=btlbw,
                     queue_size=queue_size,
                     flows=flows, server=server, client=client,
                     config_filename='experiments-all-ccalgs-aws.yaml',
                     server_nat_ip=server_nat_ip)
    
    logging.info('Running experiment: {}'.format(exp.name))

    # make sure tcpdump cleaned up
    logging.info('Making sure tcpdump is cleaned up')
    with cctestbed.get_ssh_client(
            exp.server.ip_wan,
            username=exp.server.username,
            key_filename=exp.server.key_filename) as ssh_client:
        cctestbed.exec_command(
            ssh_client,
            exp.client.ip_wan,
            'sudo pkill -9 tcpdump')
                        
    with ExitStack() as stack:
        # add DNAT rule
        stack.enter_context(add_dnat_rule(exp, url_ip))
        # add route to URL
        stack.enter_context(add_route(exp, url_ip))
        # add dns entry
        stack.enter_context(add_dns_rule(exp, website, url_ip))
        exp._run_tcpdump('server', stack)
        # run the flow
        # NOTE: a subprocess/Popen bug in Python 3.5
        # (https://bugs.python.org/issue27122) may require skip_ping=True,
        # though skip_ping is False here
        cctestbed.stop_bess()
        stack.enter_context(exp._run_bess(ping_source='server', skip_ping=False))
        # give bess some time to start
        time.sleep(5)
        exp._show_bess_pipeline()
        stack.enter_context(exp._run_bess_monitor())
        stack.enter_context(exp._run_rtt_monitor())
        with cctestbed.get_ssh_client(exp.server.ip_wan,
                                      exp.server.username,
                                      key_filename=exp.server.key_filename) as ssh_client:
            filename = os.path.basename(url)
            if filename.strip() == '':
                logging.warning('Could not get filename from URL')
            start_flow_cmd = 'timeout 65s wget --no-check-certificate --no-cache --delete-after --connect-timeout=10 --tries=3 --bind-address {}  -P /tmp/ "{}" || rm -f /tmp/{}.tmp*'.format(exp.server.ip_lan, url, filename)
            # won't return until flow is done
            flow_start_time = time.time()
            _, stdout, _ = cctestbed.exec_command(ssh_client, exp.server.ip_wan, start_flow_cmd)
            exit_status = stdout.channel.recv_exit_status()
            flow_end_time = time.time()
            logging.info('Flow ran for {} seconds'.format(flow_end_time - flow_start_time))
        exp._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))

        logging.info('Dumping website data to log: {}'.format(exp.logs['website_log']))
        with open(exp.logs['website_log'], 'w') as f:
            website_info = {}
            website_info['website'] = website
            website_info['url'] = url
            website_info['website_rtt'] = website_rtt
            website_info['url_ip'] = url_ip
            website_info['flow_runtime'] = flow_end_time - flow_start_time 
            json.dump(website_info, f)

        if exit_status != 0:
            if exit_status == 124: # timeout exit status
                print('Timeout. Flow longer than 65s.')
                logging.warning('Timeout. Flow longer than 65s.')
            else:
                logging.error(stdout.read())
                raise RuntimeError('Error running flow.')

    proc = exp._compress_logs_url()
    return (proc, '{}-{}'.format(experiment_name, exp.exp_time))
Example #11
def run_rtt_monitor(url_ip):
    # NOTE: the redirect target was left empty in the original; a log file
    # path must be filled in for this backgrounded nping command to be valid
    cmd = "nping --delay 5s {} > {} &".format(url_ip, '')
    rtt = cctestbed.run_local_command(cmd, shell=True)
    return rtt
Example #12
def get_nping_rtt(url_ip):
    cmd = "nping -v-1 -H -c 5 {} | grep -oP 'Avg rtt:\s+\K.*(?=ms)'".format(url_ip)
    rtt = cctestbed.run_local_command(cmd, shell=True)
    return rtt
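The grep -oP pattern uses PCRE \K to discard everything through 'Avg rtt:' and keep the value before 'ms'; a rough Python equivalent (the sample line is illustrative):

import re

line = 'Avg rtt: 23.500ms'
match = re.search(r'Avg rtt:\s+(.*?)(?=ms)', line)
print(match.group(1))   # '23.500'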
Example #13
def run_experiment(website1,
                   url1,
                   website2,
                   url2,
                   btlbw=10,
                   queue_size=128,
                   rtt=35,
                   force=False):
    experiment_name = '{}bw-{}rtt-{}q-{}-{}'.format(btlbw, rtt, queue_size,
                                                    website1, website2)
    if not force and is_completed_experiment(experiment_name):
        return
    if ran_experiment_today(experiment_name):
        return
    logging.info('Creating experiment for website1: {} website2: {}'.format(
        website1, website2))
    url_ip1 = get_website_ip(url1)
    url_ip2 = get_website_ip(url2)
    logging.info('Got website1 IP: {} website2 IP: {}'.format(
        url_ip1, url_ip2))
    website_rtt1 = int(float(get_nping_rtt(url_ip1)))
    website_rtt2 = int(float(get_nping_rtt(url_ip2)))
    logging.info('Got website1 RTT: {} website2 RTT: {}'.format(
        website_rtt1, website_rtt2))

    if website_rtt1 >= rtt:
        logging.warning(
            'Skipping experiment with website1 RTT {} >= {}'.format(
                website_rtt1, rtt))
        return -1
    elif website_rtt2 >= rtt:
        logging.warning(
            'Skipping experiment with website2 RTT {} >= {}'.format(
                website_rtt2, rtt))
        return -1

    # copy the template so the shared dict is not mutated across runs
    client = HOST_CLIENT_TEMPLATE.copy()
    # TODO: Which IP should be used for client?
    client['ip_wan'] = url_ip1
    client = cctestbed.Host(**client)
    server = HOST_SERVER

    server_nat_ip = HOST_CLIENT.ip_wan  #'128.104.222.182'  taro
    server_port = 5201
    client_port = 5555

    flow1 = {
        'ccalg': 'reno',
        'end_time': 60,
        'rtt': rtt - website_rtt1,
        'start_time': 0
    }
    flow2 = {
        'ccalg': 'reno',
        'end_time': 60,
        'rtt': rtt - website_rtt2,
        'start_time': 0
    }
    flows = [
        cctestbed.Flow(ccalg=flow1['ccalg'],
                       start_time=flow1['start_time'],
                       end_time=flow1['end_time'],
                       rtt=flow1['rtt'],
                       server_port=server_port,
                       client_port=client_port,
                       client_log=None,
                       server_log=None),
        cctestbed.Flow(ccalg=flow2['ccalg'],
                       start_time=flow2['start_time'],
                       end_time=flow2['end_time'],
                       rtt=flow2['rtt'],
                       server_port=server_port,
                       client_port=client_port,
                       client_log=None,
                       server_log=None),
    ]

    exp = cctestbed.Experiment(
        name=experiment_name,
        btlbw=btlbw,
        queue_size=queue_size,
        flows=flows,
        server=server,
        client=client,
        config_filename='experiments-all-ccalgs-aws.yaml',
        server_nat_ip=server_nat_ip)

    logging.info('Running experiment: {}'.format(exp.name))

    # make sure tcpdump cleaned up
    logging.info('Making sure tcpdump is cleaned up')
    with cctestbed.get_ssh_client(
            exp.server.ip_wan,
            username=exp.server.username,
            key_filename=exp.server.key_filename) as ssh_client:
        cctestbed.exec_command(ssh_client, exp.client.ip_wan,
                               'sudo pkill -9 tcpdump')

    with ExitStack() as stack:
        # add DNAT rule
        stack.enter_context(add_dnat_rule(exp, url_ip1))
        stack.enter_context(add_dnat_rule(exp, url_ip2))
        # add route to URL
        stack.enter_context(add_route(exp, url_ip1))
        stack.enter_context(add_route(exp, url_ip2))
        # add dns entry
        stack.enter_context(add_dns_rule(exp, website1, url_ip1))
        stack.enter_context(add_dns_rule(exp, website2, url_ip2))
        exp._run_tcpdump('server', stack)
        # run the flow
        # NOTE: a subprocess/Popen bug in Python 3.5
        # (https://bugs.python.org/issue27122) may require skip_ping=True,
        # though skip_ping is False here
        cctestbed.stop_bess()
        stack.enter_context(
            exp._run_bess(ping_source='server', skip_ping=False))
        # give bess some time to start
        time.sleep(5)
        exp._show_bess_pipeline()
        stack.enter_context(exp._run_bess_monitor())
        stack.enter_context(exp._run_rtt_monitor())
        with cctestbed.get_ssh_client(
                exp.server.ip_wan,
                exp.server.username,
                key_filename=exp.server.key_filename) as ssh_client:
            filename1 = os.path.basename(url1)
            filename2 = os.path.basename(url2)
            if filename1.strip() == '':
                logging.warning('Could not get filename from URL 1')
            if filename2.strip() == '':
                logging.warning('Could not get filename from URL 2')
            # Start first flow in background and second in foreground
            start_flow_cmd1 = 'timeout 65s wget --no-cache --delete-after --connect-timeout=10 --tries=3 --bind-address {}  -P /tmp/ "{}" || rm -f /tmp/{}.tmp* &'.format(
                exp.server.ip_lan, url1, filename1)
            start_flow_cmd2 = 'timeout 65s wget --no-cache --delete-after --connect-timeout=10 --tries=3 --bind-address {}  -P /tmp/ "{}" || rm -f /tmp/{}.tmp*'.format(
                exp.server.ip_lan, url2, filename2)
            # won't return until flow is done
            flow_start_time = time.time()
            _, _, _ = cctestbed.exec_command(ssh_client, exp.server.ip_wan,
                                             start_flow_cmd1)
            _, stdout, _ = cctestbed.exec_command(ssh_client,
                                                  exp.server.ip_wan,
                                                  start_flow_cmd2)
            exit_status = stdout.channel.recv_exit_status()
            flow_end_time = time.time()
            logging.info('Flow ran for {} seconds'.format(flow_end_time -
                                                          flow_start_time))
        exp._show_bess_pipeline()
        cmd = '/opt/bess/bessctl/bessctl command module queue0 get_status EmptyArg'
        print(cctestbed.run_local_command(cmd))
        if exit_status != 0:
            if exit_status == 124:  # timeout exit status
                print('Timeout. Flow longer than 65s.')
                logging.warning('Timeout. Flow longer than 65s.')
            else:
                logging.error(stdout.read())
                raise RuntimeError('Error running flow.')
    proc = exp._compress_logs_url()
    return (proc, exp.tar_filename, experiment_name)
Example #14
def load_experiments(experiment_name_patterns, remote=True, force_local=False,
                        remote_username=REMOTE_USERNAME, remote_ip=REMOTE_IP_ADDR,
                        load_queue=False, clean=False, parallel=True,
                        min_num_files=0, min_date=None, remove_duplicates=True):
    """Load all experiments into experiment analyzers
    experiment_name_pattern : list of str
        Should be a pattern that will be called
        with '{}.tar.gz'.format(experiment_name_pattern)
    remote : bool, (default: True)
        If True, look for experiments remotely.
        If False, don't look for experiments remotely,
        only locally.
    force_local : bool, (default: False)
        If True, always look for local experiments.
        If False, only look for local experiments,
        if no remote experiments are found.
    clean: bool
        If True, delete all local files matching this exp_name_pattern
        before downloading again.
    parallel: bool
        If True, run download for experiments in parallel
    min_num_files: int
        If greater than 0, then expected to get atleast this number of files
    min_date: string
        Only return experiments with equal to or large than the expected date
    remove_duplicates: bool
        Remove experiments with the same name, keeping the most recent one
    """
    assert(type(experiment_name_patterns) is list)
    tarfile_remotepaths = []
    # NOTE: this is dangerous since it runs an rm command on pattern-matched files
    if clean:
        for experiment_name_pattern in experiment_name_patterns:
            print('Deleting local files matching experiment pattern: {}'.format(experiment_name_pattern))
            run_local_command('rm {}.h5'.format(os.path.join(DATAPATH_PROCESSED, experiment_name_pattern)))
    if remote:
        print('Searching for experiments on remote machine: {}'.format(remote_ip))
        with get_ssh_client(ip_addr=remote_ip, username=remote_username) as ssh_client:
            '''
            for experiment_name_pattern in experiment_name_patterns:
                _, stdout, _ = ssh_client.exec_command(
                    'ls -1 /tmp/{}.tar.gz'.format(experiment_name_pattern))
                tarfile_remotepaths += [filename.strip()
                                        for filename in stdout.readlines()]
            '''
            cmd = 'ls -1 ' + ' '.join(['/tmp/{}.tar.gz']*len(experiment_name_patterns)).format(*experiment_name_patterns)
            print(cmd)
            _, stdout, _ = ssh_client.exec_command(cmd)
            tarfile_remotepaths += [filename.strip() for filename in stdout.readlines()]
        print('Found {} experiment(s) on remote machine: {}'.format(
            len(tarfile_remotepaths), tarfile_remotepaths))
    else:
        print('Not searching remote machine for experiments.')

    if force_local or len(tarfile_remotepaths) == 0:
        num_local_files = 0
        for experiment_name_pattern in experiment_name_patterns:
            local_filepaths = glob.glob(os.path.join(DATAPATH_RAW,
                                                     experiment_name_pattern +'.tar.gz'))
            tarfile_remotepaths += local_filepaths
            num_local_files += len(local_filepaths)
        if len(tarfile_remotepaths) == 0:
            raise ValueError(('Found no experiments on remote or local machine '
                              '{} with name patterns {}').format(
                                  remote_ip, experiment_name_patterns))
        if num_local_files > 0:
            print('Found {} experiment(s) on local machine: {}'.format(num_local_files,
                                                                        tarfile_remotepaths[-num_local_files:]))
        else:
            print('Found 0 experiment(s) on local machines.')

    if min_date is not None:
        # copy the list so we can modify it while iterating
        remotepaths = tarfile_remotepaths[:]
        num_wrong_date = 0
        for remotepath in remotepaths:
            date = os.path.basename(remotepath).split('-')[-1]
            if date < min_date:
                num_wrong_date += 1
                tarfile_remotepaths.remove(remotepath)
        if num_wrong_date > 0:
            print('Found {} experiment(s) with date smaller than {}.'.format(num_wrong_date, min_date))

    if remove_duplicates:
        # keep only the most recent experiment with each name prefix
        # (the trailing '-<timestamp>' component is stripped for comparison)
        tmp = pd.DataFrame(tarfile_remotepaths)
        num_duplicates = len(tmp)
        tarfile_remotepaths = tmp.loc[
            tmp[0].sort_values()
                  .apply(lambda x: '-'.join(x.split('-')[:-1]))
                  .drop_duplicates(keep='last')
                  .index][0].tolist()
        num_duplicates = num_duplicates - len(tarfile_remotepaths)
        if num_duplicates > 0:
            print('Found {} experiment(s) with duplicate prefixes.'.format(num_duplicates))


    if min_num_files > 0:
        if len(tarfile_remotepaths) < min_num_files:
            print('Wanted min number of {} experiment(s), but only found {}.'.format(min_num_files, len(tarfile_remotepaths)))
            tarfile_remotepaths = []

    #experiments = {}
    num_proc = 10
    num_tarfiles = len(tarfile_remotepaths)
    num_tarfiles_per_process = int(num_tarfiles / num_proc) + 1
    if parallel and num_tarfiles > 1:
        with mp.Pool(num_proc) as pool:
            analyzers = pool.starmap(
                get_experiment,
                zip(tarfile_remotepaths,
                    it.repeat(remote_ip, num_tarfiles),
                    it.repeat(remote_username, num_tarfiles),
                    it.repeat(load_queue, num_tarfiles)),
                chunksize=num_tarfiles_per_process)
    else:
        analyzers = [get_experiment(tarfile_remotepath, remote_ip,
                                    remote_username, load_queue)
                     for tarfile_remotepath in tarfile_remotepaths]
    experiment_analyzers = ExperimentAnalyzers()
    for analyzer in analyzers:
        experiment_analyzers['{}-{}'.format(analyzer.experiment.name,
                                            analyzer.experiment.exp_time)] = analyzer
    return experiment_analyzers
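A typical call, assuming tarballs named like '<experiment-name>-<timestamp>.tar.gz' and a dict-like ExperimentAnalyzers (the pattern and date below are illustrative):

analyzers = load_experiments(['10bw-35rtt-128q-*'],
                             remote=True,
                             load_queue=True,
                             min_date='20190101')
for name in analyzers:
    print(name)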
Example #15
    def _create_hdf_queue(self, raw_queue_log_tarpath, raw_queue_log_localpath, processed_queue_log_localpath):
        # haven't created HDF5 store yet; create it now
        find_bad_lines_cmd = 'grep ^.*,.*,.*,.*,.*,.*,.*,.*,.*$ {} -v -n'.format(raw_queue_log_localpath)
        badlines = run_local_command(find_bad_lines_cmd, shell=False).split('\n')
        if len(badlines) >= 1 and badlines[0] != '':
            # bad lines found: just sort, and report them for manual inspection
            sort_cmd = 'sort -k 2 -o {} {}'.format(raw_queue_log_localpath, raw_queue_log_localpath)
            print('Found {} bad lines:\n {}'.format(len(badlines), badlines))
        else:
            # no bad lines: sort, then keep only well-formed 9-field rows
            tmp_queue_filename = raw_queue_log_localpath + '.tmp'
            sort_cmd = 'sort -k 2 -o {} {} && grep ^.*,.*,.*,.*,.*,.*,.*,.*,.*$ {} > {} && mv {} {}'.format(
                        raw_queue_log_localpath, raw_queue_log_localpath, raw_queue_log_localpath, tmp_queue_filename, tmp_queue_filename, raw_queue_log_localpath)

        with untarfile(self.experiment.tarfile_localpath, raw_queue_log_tarpath, postprocess_cmd=sort_cmd) as f:
            with pd.HDFStore(processed_queue_log_localpath, mode='w') as store:
                df = pd.read_csv(f,
                                 names=['dequeued', 'time', 'src', 'seq',
                                        'datalen', 'size', 'dropped',
                                        'queued', 'batch'],
                                 converters={'seq': tohex, 'src': tohex},
                                 dtype={'dequeued': bool,
                                        'time': np.uint64,
                                        'datalen': np.uint16,
                                        'size': np.uint32,
                                        'dropped': bool,
                                        'queued': np.uint16,
                                        'batch': np.uint16},
                                 skip_blank_lines=True)
                df['seq'] = df['seq'].astype(np.uint32)
                df['src'] = df['src'].astype(np.uint16)
                #chunk['time'] = pd.to_datetime(chunk['time'], infer_datetime_format=True, unit='ns')
                df['lineno'] = df.index + 1
                df = df.set_index('time')
                df_enq = (pd
                          .get_dummies(df[(df.dequeued==0) & (df.dropped==0)]['src'])
                          .astype(np.uint8))
                df_deq = (pd
                          .get_dummies(df[df.dequeued==1]['src'])
                          .replace(1, -1)
                          .astype(np.int8))
                # cumsum may return negative numbers; converting those
                # straight to uint32 would produce very large values
                df_flows = (df_enq
                            .append(df_deq)
                            .sort_index()
                            .cumsum()
                            .fillna(0)
                            .astype(np.uint32))
                df = (df
                      #.reset_index()
                      .join(df_flows)
                      .sort_index()
                      .ffill())
                df.index = pd.to_datetime(df.index,
                                        infer_datetime_format=True,
                                        unit='ns')
                #df = df.set_index('time')
                store.append('df_queue',
                            df,
                            format='table',
                            data_columns=['src', 'dropped', 'dequeued'])
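The get_dummies/cumsum chain above computes a running per-source queue occupancy: each enqueue contributes +1 and each dequeue -1 in that source's column. A toy illustration of the same idea (pd.concat stands in for the deprecated DataFrame.append):

import pandas as pd

events = pd.DataFrame({'src': [1, 1, 2, 1],
                       'dequeued': [False, False, False, True]})
enq = pd.get_dummies(events[~events.dequeued]['src'], dtype=int)
deq = pd.get_dummies(events[events.dequeued]['src'], dtype=int).replace(1, -1)
occupancy = pd.concat([enq, deq]).sort_index().fillna(0).cumsum()
print(occupancy)   # per-src packets in the queue after each event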