Example #1
def _extract_owd_pktloss(test_id='', out_dir='', replot_only='0', source_filter='',
                ts_correct='1', burst_sep='0.0', sburst='1', eburst='0',
                seek_window='16000', log_loss='0', anchor_map='', owd_midpoint='0'):
    "Extract OWD or PKTLOSS of flows"

    ifile_ext = '.dmp.gz'
    
    if log_loss == '0':
        ofile_ext = '.owds2.gz'
    else:
        ofile_ext = '.loss2.gz'
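    # Note on log_loss modes (based on the matching loop further below):
    #   '0' writes one-way delay (OWD) samples, '1' writes per-packet loss
    #   events (0 = arrived, 1 = lost), '2' writes a cumulative loss count.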

    already_done = {}
    out_files = {}
    out_groups = {}

    test_id_arr = test_id.split(';')
    if len(test_id_arr) == 0 or test_id_arr[0] == '':
        abort('Must specify test_id parameter')

    if ts_correct == '0' and log_loss == '0':
        abort('Must use ts_correct=1 when calculating OWD')

    # Initialise source filter data structure
    sfil = SourceFilter(source_filter)
    
    # EXPERIMENTAL: anchor_map="<srcip1>:<dstip1>;<srcip2>:<dstip2>;..."
    # Normally a packet's OWD is logged as having occurred at the time the
    # packet is seen in the pcap file associated with <srcip> (src_extname).
    # An 'anchor_map' entry allows you to specify that a packet's OWD be
    # logged as having occurred at the time the packet from <srcip> was
    # seen in the pcap file associated with <dstip> (dst_extname).
    # This only operates on packets between <srcip>:<dstip>; other
    # flows in the same test ID are unaffected.
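    # Illustrative (hypothetical) example: anchor_map="172.16.10.2:172.16.11.2"
    # would log OWDs for packets sent from 172.16.10.2 to 172.16.11.2 at the
    # (adjusted) time they are seen in 172.16.11.2's pcap file, and the parsing
    # below would yield anchor_map_list = {'172.16.10.2': '172.16.11.2'}.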

    anchor_map_list = {}
    if anchor_map != '':
        if replot_only == '1':
            abort("Must specify replot_only=0 in conjunction with anchor_map")
        entries = anchor_map.split(';')
        for entry in entries:
            k, v = entry.split(':')
            anchor_map_list[k] = v

    group = 1
    
    for test_id in test_id_arr:

        # first process tcpdump files (ignore router and ctl interface tcpdumps)
        tcpdump_files = get_testid_file_list('', test_id,
                                ifile_ext, 
                                'grep -v "router.dmp.gz" | grep -v "ctl.dmp.gz"')

        for tcpdump_file in tcpdump_files:
            
            # get input directory name and create result directory if necessary
            out_dirname = get_out_dir(tcpdump_file, out_dir) 
            dir_name = os.path.dirname(tcpdump_file)
            
            # get unique flows
            flows = lookup_flow_cache(tcpdump_file)
            if flows is None:
                # If not previously found in flow_cache,
                # extract and identify the tcp and udp flows contained in tcpdump_file
                flows = _list(local('zcat %s | tcpdump -nr - "tcp" | '
                                'awk \'{ if ( $2 == "IP" ) { print $3 " " $5 " tcp" } }\' | '
                                'sed "s/://" | '
                                'sed "s/\.\([0-9]*\) /,\\1 /g" | sed "s/ /,/g" | '
                                'LC_ALL=C sort -u' %
                                tcpdump_file, capture=True))
                flows += _list(local('zcat %s | tcpdump -nr - "udp" | '
                                 'awk \'{ if ( $2 == "IP" ) { print $3 " " $5 " udp" } }\' | '
                                 'sed "s/://" | '
                                 'sed "s/\.\([0-9]*\) /,\\1 /g" | sed "s/ /,/g" | '
                                 'LC_ALL=C sort -u' %
                                 tcpdump_file, capture=True))

                # Add them to the flow cache
                append_flow_cache(tcpdump_file, flows)
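                # Each cached flow entry is a comma-separated string of the form
                # <src_ip>,<src_port>,<dst_ip>,<dst_port>,<tcp|udp>, which is what
                # the split(',') in the loop below relies on.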

            # Walk through and process the flows identified in the current tcpdump_file
            
            for flow in flows:
                
                # First extract src & dst as IP addr on experiment networks
                src, src_port, dst, dst_port, proto = flow.split(',')
                
                # Map src & dst (IP addr) to testbed-specific external (control network)
                # host names from config.py ({src,dst}_extname) and internal (experiment network)
                # hostnames/addresses from config.py ({src,dst}_internal)
                src_extname, src_internal = get_address_pair_analysis(test_id, src, do_abort='0')
                dst_extname, dst_internal = get_address_pair_analysis(test_id, dst, do_abort='0')
                
                # Skip random (broadcast) traffic involving an IP address that has
                # no directly related experimental network NIC (and hence no
                # control network hostname)
                if src_extname == '' or dst_extname == '':
                    continue

                # flow name: <src_internal>_<src_port>_<dst_internal>_<dst_port>
                name = src_internal + '_' + src_port + '_' + dst_internal + '_' + dst_port
                
                # test id plus flow name
                if len(test_id_arr) > 1:
                    long_name = test_id + '_' + name
                else:
                    long_name = name
                    
                if long_name not in already_done:

                    # Construct filenames for files containing final <time> <owd|loss> pairs
                    out_final = out_dirname + test_id + '_' + name + ofile_ext
                    
                    # Only embark on actual filtering/extraction if we're asked to regenerate
                    # the intermediate OWD values, or for some reason the intermediate OWD
                    # file is missing...
                    if replot_only == '0' or not os.path.isfile(out_final):
                                                
                        # Per flow/host:
                        #   Create intermediate file of timestamps + uniqString from pcap files,
                        #   THEN call adjust_timestamps to construct a version with timestamps adjusted
                        #   relative to a single reference host in the testbed
                        #   THEN use the adjusted timestamps in the subsequent owd/loss calculations.
                        
                        # To extract packets in the FORWARD direction from both the src and dst pcap files,
                        # construct a dpkt flow filter of the form <src_ip>:<src_port>:<dst_ip>:<dst_port>
                        # (more specifically, from src_internal:src_port to dst_internal:dst_port).

                        filter_dpkt = src_internal + ':' + src_port + ':' + dst_internal + ':' + dst_port
                        src_port_int = int(src_port)
                        dst_port_int = int(dst_port)
                        
                        # Loop across the src and dst dmp files
                        
                        tmp_fwd_out_adj = {}
                        for dmpfile_host, dirsuffix in ([src_extname,"src"],[dst_extname,"dst"]):
                            
                            # Construct the name of the dump file that contains this flow's
                            # packets: 'src' was captured at (or near) the flow's source,
                            # 'dst' at (or near) the flow's destination.
                            dmp_file = dir_name + '/' + test_id + '_' + dmpfile_host + ifile_ext
                            print "Extracting packets for " + name + " from:" + dmp_file
                            
                            # Construct filename for the intermediate "<time> <uniqueString>" output file
                            # whose timestamps will be adjusted by adjust_timestamps()
                            # before being used for OWD calculations.
                            # (NOTE: Because adjust_timestamps makes assumptions about the out_dir
                            # parameter, we currently can't place these tmp files under /tmp)
                            tmp_fwd_out = tempfile.mktemp(suffix=test_id + '_' + name + '_fwd_out_' + dirsuffix + '.gz', dir=out_dirname)
                            
                            # Extract packet id info
                            if dmp_file.endswith('.gz'):
                                f_dmp_file = gzip.open(dmp_file)
                            else:
                                f_dmp_file = open(dmp_file)
                            pcap_reader = dpkt.pcap.Reader(f_dmp_file)
                            pcap_reader.setfilter(filter_dpkt)
                            #pcap_reader.setfilter('')
                            
                            # Create a compressed temporary intermediate file
                            f_tmp_fwd_out = gzip.open(tmp_fwd_out,'wb',1)
                            
                            # Walk across every packet in this pcap file
                            for ts, pkt in pcap_reader:
                                # get pointer to ethernet layer and check that we have IP
                                eth = dpkt.ethernet.Ethernet(pkt)
                                if eth.type != dpkt.ethernet.ETH_TYPE_IP:
                                    continue

                                # get pointer to IP layer
                                ip_pkt = eth.data

                                # ignore if src or dst IP not the ones specified in filter
                                if socket.inet_ntoa(ip_pkt.src) != src_internal or \
                                   socket.inet_ntoa(ip_pkt.dst) != dst_internal:
                                    continue

                                # ignore if UDP/TCP src or dst ports not the ones specified in filter
                                # get pointer to payload
                                if type(ip_pkt.data) == dpkt.udp.UDP:
                                    udp_frame = ip_pkt.data
                                    if udp_frame.sport != src_port_int or udp_frame.dport != dst_port_int:
                                        continue
                                    # Add IP ID field to the payload to ensure
                                    # at least something semi-unique is hashed
                                    # if UDP payload is invariant
                                    payload = str(ip_pkt.id) + udp_frame.data
                                    
                                elif type(ip_pkt.data) == dpkt.tcp.TCP:
                                    tcp_frame = ip_pkt.data
                                    if tcp_frame.sport != src_port_int or tcp_frame.dport != dst_port_int:
                                        continue
                                    # Use IP ID field, TCP Sequence number and ACK number to
                                    # construct a mostly unique string within context of this flow
                                    payload = str(ip_pkt.id) + str(tcp_frame.seq) + str(tcp_frame.ack)
                                else:
                                    continue
                                
                                # Write <timestamp> <crc32 hash of uniqueString bytes>
                                # (hashing here eliminates any later problems parsing payloads
                                # containing null bytes)

                                f_tmp_fwd_out.write("%f %s\n" % (ts, zlib.crc32(payload)))

                            f_tmp_fwd_out.close()
                            f_dmp_file.close()
                            
                            # Apply timestamp corrections to the data thus extracted, prior to
                            # calculating OWDs. Correction is MANDATORY otherwise the
                            # 'calculated' OWDs are essentially useless.
                            tmp_fwd_out_adj[dirsuffix] = adjust_timestamps(test_id, tmp_fwd_out, dmpfile_host, ' ', out_dir)
                            
                            # Remove pre-adjustment files.
                            os.remove(tmp_fwd_out)
                            
                        # Now we have unique packet hashes seen at both the src and dst locations,
                        # and the timestamps have been adjusted for clock offsets.
                        
                        # Begin calculating OWD or identifying when packet losses occurred
                        
                        # Read into memory the <adjusted_timestamp> <uniqString> dataset captured
                        # at dst (the 2nd place the packet is seen, the "destination"). The src
                        # dataset (1st place the packet is seen, the "source") is processed in
                        # the matching loop below.
                        
                        dst_data_time = list()
                        dst_data_uniqString = list()
                        for line in gzip.open(tmp_fwd_out_adj["dst"]).read().splitlines():
                            sline = line.split(" ")
                            dst_data_time.append(float(sline[0]))
                            dst_data_uniqString.append(sline[1])

                        # Walk through tmp_fwd_out_adj["src"] looking for matches to packets
                        # in dst_data_uniqString, and write <time> <owd|loss> pairs in plain
                        # ASCII to out_final
                        
                        # To limit potential duplicate matches to packets received forward
                        # in time from the previous match in dst_data_uniqString, maintain an
                        # index next_j pointing to the next row in dst_data_uniqString at which
                        # to start matching the next packet from tmp_fwd_out_adj["src"]
                        next_j = 0
                        last_j = len(dst_data_uniqString)-1

                        # As a speed-up hack, assume a match in dst_data_uniqString is
                        # within sk_window entries of next_j (this saves searching all the
                        # way to the end of dst_data_uniqString when seeking a lost packet).
                        # Keeping seek_window in the low 1000s also minimises the chances of
                        # duplicate matches.
                        if seek_window != '':
                            sk_window = int(seek_window)
                        else:
                            sk_window = last_j
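                        # Illustrative (hypothetical) walk-through: with next_j = 100 and
                        # sk_window = 16000, each src hash is looked for in
                        # dst_data_uniqString[100:16100]; a hit at relative index j yields
                        # OWD = dst_data_time[100+j] - <src timestamp> and next_j advances
                        # to 100+j+1 (capped at last_j), while a miss (ValueError) is
                        # treated as a lost packet.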
                        
                        # Create a gzipped output file (rough experiments showed a reduction
                        # in on-disk file size from easily 100s of KB down to 10s of KB).
                        # R automagically reads gzipped data files, so no changes are required
                        # to the subsequent analyse_* plotting scripts.
                        f = gzip.open(out_final, 'w')
                        
                        cumulative_loss = 0
                        
                        # Decide whether to use timestamp at src or dst for OWD
                        # (Default to src, unless anchor_map indicates using dst for
                        # this particular traffic pattern)
                        anchor = 0  # Default: print timestamp at src
                        if log_loss == '0': # Only relevant for OWD calculations
                            if src_internal in anchor_map_list:
                                #print "*** Found " + src_internal + " in anchor_map"
                                if anchor_map_list[src_internal] == dst_internal:
                                    # Only relevant if the map relates to both src_internal and dst_internal
                                    #print "*** " + src_internal + " points to " + anchor_map_list[src_internal] + " in anchor_map"
                                    anchor = 1 # Print timestamp at dst
                                        
                        for line in gzip.open(tmp_fwd_out_adj["src"]).read().splitlines():
                            i = line.split(" ")
                            try:
                                # The following search will raise a 'ValueError' exception if i[1] does not occur in dst_data_uniqString[next_j:]
                                j = dst_data_uniqString[next_j:min((next_j+sk_window),last_j+1)].index(i[1])
                                
                                if log_loss == '0':
                                    # OWD is diff between i[0] and dst_data_time[next_j+j]
                                    ts = float(i[0])
                                    owd = dst_data_time[next_j+j]-float(i[0])
                                    # If required, log the event as occurring at the dst timestamp rather than the src timestamp
                                    if anchor:
                                        ts = dst_data_time[next_j+j]
                                    # If we want to imply the OWD "existed" at some mid-point
                                    # between pkt seen at src and seen at dst
                                    if owd_midpoint == '1':
                                        ts += owd/2                                    
                                    f.write('%f %f\n' % (ts, owd))
                                    
                                if log_loss == '1':
                                    # No lost packet, emit "0"
                                    f.write('%s 0\n' % (i[0]))
                                if log_loss == '2':
                                    # No lost packet, emit previous cumulative count
                                    f.write('%s %i\n' % (i[0], cumulative_loss))
                                    
                                next_j = min(next_j+j+1,last_j)
                                
                            except ValueError:
                                # No match means a packet loss
                                if log_loss == '1':
                                    # Single loss event, emit "1"
                                    f.write('%s 1\n' % (i[0]))
                                if log_loss == '2':
                                    # Single loss event, increment cumulative count, emit cumulative count
                                    cumulative_loss += 1
                                    f.write('%s %i\n' % (i[0], cumulative_loss))
                                pass
                                                    
                        f.close()
                        dst_data_time=[]
                        dst_data_uniqString=[]
                        
                        # Clean up temporary post-adjustment files
                        os.remove(tmp_fwd_out_adj["src"])
                        os.remove(tmp_fwd_out_adj["dst"])
                        
                    already_done[long_name] = 1
                    
                    if sfil.is_in(name):
                        (out_files, 
                        out_groups) = select_bursts(long_name, group, out_final, burst_sep, sburst, eburst,
                                    out_files, out_groups)
                         
        group += 1

    return (test_id_arr, out_files, out_groups)
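
A minimal, hypothetical invocation of the routine above (e.g. from a Fabric analysis task). The test ID value is a placeholder; only parameters defined in the signature above are used:

# Hedged usage sketch: extract OWD samples for all flows of one experiment,
# timestamping each OWD at the mid-point between the src and dst captures.
_extract_owd_pktloss(
    test_id='20150503-120000_experiment',  # placeholder test ID
    ts_correct='1',                        # mandatory when calculating OWD
    log_loss='0',                          # '0' = OWD; '1'/'2' = loss logging
    owd_midpoint='1')                      # shift timestamps by owd/2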
Example #2
def get_testid_file_list(file_list_fname='',
                         test_id='',
                         file_ext='',
                         pipe_cmd='',
                         search_dir='.',
                         no_abort=False):

    file_list = []

    # if search dir is not specified, try to find it in the cache
    if search_dir == '.':
        search_dir = lookup_dir_cache(test_id)

    if file_list_fname == '':
        # read test IDs from the test_id parameter; this always overrules a list
        # in a file if both are specified

        test_id_arr = test_id.split(';')

        if len(test_id_arr) == 0 or test_id_arr[0] == '':
            abort('Must specify test_id parameter')

        if pipe_cmd != '':
            pipe_cmd = ' | ' + pipe_cmd

        for test_id in test_id_arr:
            _files = _list(
                local(
                    'find -L %s -name "%s*%s" -print | sed -e "s/^\.\///"%s' %
                    (search_dir, test_id, file_ext, pipe_cmd),
                    capture=True))

            _files = filter_duplicates(_files)

            if search_dir == '.' and len(_files) > 0:
                append_dir_cache(test_id, os.path.dirname(_files[0]))

            file_list += _files
    else:
        # read the list of data file names from a file

        try:
            lines = []
            with open(file_list_fname) as f:
                lines = f.readlines()
            for fname in lines:
                fname = fname.rstrip()
                _files = _list(
                    local('find -L %s -name "%s" -print | sed -e "s/^\.\///"' %
                          (search_dir, fname),
                          capture=True))

                _files = filter_duplicates(_files)

                if search_dir == '.' and len(_files) > 0:
                    append_dir_cache(test_id, os.path.dirname(_files[0]))

                file_list += _files

        except IOError:
            abort('Cannot open experiment list file %s' % file_list_fname)

    if not no_abort and len(file_list) == 0:
        abort('Cannot find any matching data files.\n'
              'Remove outdated teacup_dir_cache.txt if files were moved.')

    return file_list
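
For reference, Example #1 above calls this helper to collect the per-host pcap files of a test, roughly as follows (the test ID is a placeholder; the grep pipeline is copied from Example #1):

# Find all '<test_id>*.dmp.gz' files, excluding the router and control-interface
# captures, as done at the top of _extract_owd_pktloss().
tcpdump_files = get_testid_file_list(
    '', '20150503-120000_experiment', '.dmp.gz',
    'grep -v "router.dmp.gz" | grep -v "ctl.dmp.gz"')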