def test_ug_3(self):
    """update_guess must ignore a location that is above the current high bound."""
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_LOW, LogLocation.TOO_LOW)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    guesses = [lo, hi]
    new = LogLocation(3779999999, datetime(datetime.now().year, 2, 16, 22, 26, 26),
                      LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    answer = [lo, hi]  # no update, too high
    tgrep.update_guess(new, guesses)
    self.assertEqual(answer, guesses)  # assertEquals is a deprecated alias
def test_ug_4(self):
    """update_guess must ignore a location already inside the current bounds."""
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_LOW, LogLocation.TOO_LOW)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    guesses = [lo, hi]
    new = LogLocation(12345, datetime(datetime.now().year, 2, 13, 19, 0, 0),
                      LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    answer = [lo, hi]  # no update, in range
    tgrep.update_guess(new, guesses)
    self.assertEqual(answer, guesses)  # assertEquals is a deprecated alias
def test_ug_2(self):
    """update_guess must tighten the high bound when the new location is a better max."""
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_LOW, LogLocation.TOO_LOW)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    guesses = [lo, hi]
    new = LogLocation(2220000, datetime(datetime.now().year, 2, 14, 6, 6, 6),
                      LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    answer = [lo, new]  # update high
    tgrep.update_guess(new, guesses)
    self.assertEqual(answer, guesses)  # assertEquals is a deprecated alias
def pessismistic_binary_search(log, seek_loc, times, results, arr):
    """Reads only a little and checks only the first timestamp. Better when it's way off base."""
    # Seek to the guess and pull just enough bytes to contain one whole line.
    log.seek(seek_loc)
    arr[0] += 1  # shared seek counter
    chunk = log.read(MORE_THAN_ONE_LINE)
    arr[1] += 1  # shared read counter

    # Locate the first newline; the timestamp we parse starts just past it.
    newline_at = chunk.find("\n")
    if newline_at == -1:
        results.put(None)
        return  # //! better error case?
    stamp_start = newline_at + 1  # get past the newline

    # Grab the first ~20 bytes of the line (e.g. "Feb 14 05:52:12 web0"),
    # split on whitespace, and rejoin the first three fields with ' ' so the
    # parser sees "Feb 14 05:52:12".
    # //! need to research a better (faster?) way to do this
    fields = chunk[stamp_start:stamp_start + 20].split()[:3]  #//! magic 20
    stamp = parse_time(' '.join(fields))

    # Report where we looked, the time we found there, and how that time
    # compares to the desired min and max.
    results.put(LogLocation(seek_loc, stamp,
                            logloc.time_cmp(stamp, times[0]),
                            logloc.time_cmp(stamp, times[1])))
def test_bsg(self):
    """binary_search_guess should return the (rounded) midpoint of the two bound locations."""
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 23, 33, 11),
                     LogLocation.MATCH, LogLocation.MATCH)
    hi = LogLocation(10, datetime(datetime.now().year, 2, 13, 23, 33, 15),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    bounds = [lo, hi]
    foo = None  # ignored anyways
    self.assertEqual(5, tgrep.binary_search_guess(bounds, foo))
    bounds[1].set_loc(100)
    self.assertEqual(50, tgrep.binary_search_guess(bounds, foo))
    bounds[1].set_loc(1000)
    self.assertEqual(500, tgrep.binary_search_guess(bounds, foo))
    bounds[0].set_loc(337)
    bounds[1].set_loc(222)
    # actually is 279.5, testing round
    self.assertEqual(279, tgrep.binary_search_guess(bounds, foo))
    bounds[0].set_loc(0)
    bounds[1].set_loc(993837478)
    self.assertEqual(496918739, tgrep.binary_search_guess(bounds, foo))
def test_prn_m_2(self):
    """print_log_lines with a zero-byte range (both bounds at the same seek location)."""
    global expected_log2  # zero bytes
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(1507623, datetime(datetime.now().year, 2, 13, 23, 33, 3),
                     LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    lo.set_is_min(True)
    hi = LogLocation(1507623, datetime(datetime.now().year, 2, 13, 23, 33, 3),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)  # same log, basically
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log2, self.log_entries.getvalue())
def test_prn_m_0(self):
    """print_log_lines over exactly one matching log line."""
    global expected_log0
    # Feb 13 23:33:11 (one log line)
    # [[1508000, 2011-02-13 23:33:11, 0, 0, True, False], [1508377, 2011-02-13 23:33:15, 1, 1, False, True]]
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(1508000, datetime(datetime.now().year, 2, 13, 23, 33, 11),
                     LogLocation.MATCH, LogLocation.MATCH)
    lo.set_is_min(True)
    hi = LogLocation(1508377, datetime(datetime.now().year, 2, 13, 23, 33, 15),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log0, self.log_entries.getvalue())
def test_prn_m_6(self):
    """print_log_lines from the start of file, a chunk with no exact matches."""
    global expected_log6
    # Feb 13 18:30:30 (Start of File, chunk, no exact matches)
    # [[0, 2011-02-13 18:31:30, 1, -1, True, False], [2639, 2011-02-13 18:32:08, 1, 1, False, True]]
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    lo.set_is_min(True)
    hi = LogLocation(2639, datetime(datetime.now().year, 2, 13, 18, 32, 8),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log6, self.log_entries.getvalue())
def test_prn_m_5(self):
    """print_log_lines from the start of file covering exactly one line."""
    global expected_log5
    # Feb 13 18:31:30 (Start of File, exactly one line)
    # [[0, 2011-02-13 18:31:30, 0, 0, True, False], [377, 2011-02-13 18:31:36, 1, 1, False, True]]
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_LOW, LogLocation.TOO_LOW)
    lo.set_is_min(True)
    hi = LogLocation(377, datetime(datetime.now().year, 2, 13, 18, 31, 36),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log5, self.log_entries.getvalue())
def test_prn_m_4(self):
    """print_log_lines of a chunk running to the end of file."""
    global expected_log4
    # Feb 14 07:07:39 (End of File, chunk)
    # [[3765853, 2011-02-14 07:07:01, 1, -1, True, False], [3770000, 2011-02-14 07:07:39, 1, 1, False, True]]
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(3765853, datetime(datetime.now().year, 2, 14, 7, 7, 1),
                     LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    lo.set_is_min(True)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log4, self.log_entries.getvalue())
def test_prn_m_1(self):
    """print_log_lines over a whole minute of log."""
    global expected_log1
    # Feb 13 23:33 (whole minute)
    # [[1507623, 2011-02-13 23:33:03, 1, -1, True, False], [1512524, 2011-02-13 23:34:03, 1, 1, False, True]]
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(1507623, datetime(datetime.now().year, 2, 13, 23, 33, 3),
                     LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    lo.set_is_min(True)
    hi = LogLocation(1512524, datetime(datetime.now().year, 2, 13, 23, 34, 3),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_min(True)
    bounds = [lo, hi]
    tgrep.print_log_lines(self.mlog_file, bounds, self.log_entries)
    self.assertEqual(expected_log1, self.log_entries.getvalue())
def test_ug_6(self):
    """update_guess must adopt a new location whose is_max flag is set."""
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_LOW, LogLocation.TOO_LOW)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    guesses = [lo, hi]
    new = LogLocation(1234, datetime(datetime.now().year, 2, 13, 19, 0, 0),
                      LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    new.set_is_max(True)
    answer = [lo, new]  # update, is_max
    tgrep.update_guess(new, guesses)
    self.assertEqual(answer, guesses)  # assertEquals is a deprecated alias
def test_ug_8(self):
    """update_guess must not replace a max bound whose is_max flag is already set."""
    #//! impl in update_guess return
    # lo/hi instead of min/max: don't shadow the builtins
    lo = LogLocation(0, datetime(datetime.now().year, 2, 13, 18, 31, 30),
                     LogLocation.TOO_HIGH, LogLocation.TOO_LOW)
    hi = LogLocation(3770000, datetime(datetime.now().year, 2, 14, 7, 7, 39),
                     LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    hi.set_is_max(True)
    guesses = [lo, hi]
    new = LogLocation(1234, datetime(datetime.now().year, 2, 13, 19, 0, 0),
                      LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)
    answer = [lo, hi]  # ignore, is_max already set for max
    tgrep.update_guess(new, guesses)
    self.assertEqual(answer, guesses)  # assertEquals is a deprecated alias
def optimistic_edge_search(log, guess, times, results):
    """Reads a chunk and checks whole thing for timestamps. Better when it's really close.

    log     -- open log file; seeked and one EDGE_SWEEP_CHUNK_SIZE read is done here.
    guess   -- LogLocation near the desired edge; its minmax relation picks scan direction.
    times   -- pair of (min desired time, max desired time).
    results -- list; the best LogLocation found in the chunk is appended to it.
    """
    global stats  # module-level seek/read counters
    seek_loc = guess._seek_loc
    if guess.get_minmax() == LogLocation.OUT_OF_RANGE_HIGH:
        # we're looking for the max and we're above it, so read from a chunk away to here.
        # print "looking high"
        # print "%d %d" % (guess._seek_loc, guess._seek_loc - EDGE_SWEEP_CHUNK_SIZE)
        seek_loc -= EDGE_SWEEP_CHUNK_SIZE
        seek_loc = 0 if seek_loc < 0 else seek_loc  # clamp to start of file
        log.seek(seek_loc)
        stats['seeks'] += 1
    else:
        # we're looking for the min and we're below it, so read starting here.
        log.seek(seek_loc)
        stats['seeks'] += 1
    chunk = log.read(EDGE_SWEEP_CHUNK_SIZE)
    stats['reads'] += 1
    prev_minmax = guess.get_minmax()  # relation of the previous line examined
    result = LogLocation(0, datetime.min, LogLocation.TOO_LOW,
                         LogLocation.TOO_HIGH)  # an invalid result to start with
    chunk_loc = 0
    end_loc = chunk.rfind('\n')  # last complete line boundary in the chunk
    while chunk_loc < end_loc:
        # print "%d / %d" % (seek_loc + chunk_loc, seek_loc + end_loc)
        try:
            # find the nearest newline so we can find the timestamp
            nl_index = chunk[chunk_loc:].find('\n')
            if nl_index == -1:
                # print "can't find new line"
                break  # Can't find a newline; we're done.
            nl_index += 1  # get past the newline
            # find the first bit of the line, e.g. "Feb 14 05:52:12 web0"
            # split it on the whitespace, e.g. ["Feb", "14", "05:52:12", "web0"]
            # join the first three back together again with ' ' as the seperator
            # parse the thing!
            # //! need to research a better (faster?) way to do this
            time = parse_time(' '.join(
                chunk[chunk_loc + nl_index:chunk_loc + nl_index + 20].split()[:3]))  #//! magic 20
            # NOTE(review): chunk_loc also advances by nl_index in the finally
            # clause below, so the success path advances twice per iteration —
            # confirm this double-advance is intended and doesn't skip lines.
            chunk_loc += nl_index
            result._seek_loc = seek_loc + chunk_loc
            result._timestamp = time
            # compare to desired to see if it's a better max
            if time > times[1]:
                result._relation_to_desired_min = LogLocation.TOO_HIGH
                result._relation_to_desired_max = LogLocation.TOO_HIGH
                # check to see if it's the edge
                if prev_minmax[1] != LogLocation.TOO_HIGH:
                    # We passed out of range. This loc is where we want to /stop/ reading. Save it!
                    result.set_is_max(True)
                    # print "short circuit"
                break  # Can short-circuit if find a max, since we're reading buff beginning-to-end.
            elif time == times[1]:
                # do nothing for now about data, may optimize to save off data later.
                result._relation_to_desired_max = LogLocation.MATCH
            else:  # time < times[1]
                result._relation_to_desired_max = LogLocation.TOO_LOW
            # and now the min
            if time < times[0]:
                result._relation_to_desired_min = LogLocation.TOO_LOW
            elif time == times[0]:
                # do nothing for now about data, may optimize to save off data later.
                result._relation_to_desired_min = LogLocation.MATCH
            else:  # time > times[0]
                result._relation_to_desired_min = LogLocation.TOO_HIGH
            # print result.get_minmax()
            # see if we got the min edge (max was checked above)
            p = prev_minmax[0]
            r = result._relation_to_desired_min
            # print "p: %d, r: %d" % (p,r)
            if (prev_minmax == LogLocation.OUT_OF_RANGE_LOW) and (
                    result.get_minmax() == LogLocation.OUT_OF_RANGE_HIGH):
                pass  # //! No matches! Tell the dude and quit!
            elif (p == LogLocation.TOO_LOW) and (r != LogLocation.TOO_LOW):
                # We passed into our range via min. This is one.
                result.set_is_min(True)
                break
            prev_minmax = result.get_minmax()
        except ValueError:  # not a time string found
            print "time parse error"
            pass  # we're ok with occasional non-time string lines. Might start the read in the middle of a line, for example.
        finally:
            chunk_loc += nl_index  # advance past the newline just handled (runs on every path)
    # print result
    results.append(result)
def wide_sweep(log, filesize, times, num_procs):
    """Parallel binary search over the log file for the byte range covering `times`.

    log       -- open log file handle shared with the worker processes.
    filesize  -- total size of the log in bytes (initial upper bound).
    times     -- pair of (min desired time, max desired time).
    num_procs -- number of pessismistic_binary_search workers spawned per loop.
    Returns (hits, nearest_guesses): exact-match LogLocations found, and the
    tightest [low, high] bound pair reached.
    """
    # binary search, with friends!
    # for the global counters...
    global stats
    # shared int array so child processes can bump the seek/read counters
    arr = Array('i', [stats['seeks'], stats['reads']])
    results = Queue()  # workers put LogLocation results here
    # initial bounds: before everything / after everything
    nearest_guesses = [
        LogLocation(0, datetime(1999, 1, 1, 00, 00, 00),
                    LogLocation.TOO_LOW, LogLocation.TOO_LOW),  # //! make y1k safe
        LogLocation(filesize, datetime.now().replace(year=3000),
                    LogLocation.TOO_HIGH, LogLocation.TOO_HIGH)]  # //! make y3k safe
    prev_focus, seek_guesses = binary_search_guess(
        nearest_guesses[0]._seek_loc, nearest_guesses[1]._seek_loc, num_procs)
    hits = []
    found = False
    # //! binary search until focal point of search is same
    while not found:
        stats['wide_sweep_loops'] += 1
        # fan out one worker per guessed seek location
        children = []
        for seek_loc in seek_guesses:
            p = Process(target=pessismistic_binary_search,
                        args=(log, seek_loc, times, results, arr))
            p.start()
            children.append(p)
        for child in children:
            child.join()  # wait for all procs to finish before calculating next step
        focus = -1
        # fold every worker result into the running bounds
        while not results.empty():
            result = results.get()
            # //! need to check guess error state
            if LogLocation.MATCH in result.get_minmax():
                print "found it!"  # //!
                hits.append(result)
                found = True
            # print result
            update_guess(result, nearest_guesses)  # updates nearest_guesses in place
        # print nearest_guesses
        # print seek_guesses
        # recompute the focal point and the next round of guesses from the new bounds
        focus, seek_guesses = binary_search_guess(
            nearest_guesses[0]._seek_loc, nearest_guesses[1]._seek_loc, num_procs)
        # print seek_guesses
        if focus == prev_focus:
            # focal point stopped moving: the search has converged
            print "steady state!"  # //!
            found = True
            break
        elif (nearest_guesses[1]._seek_loc - nearest_guesses[0]._seek_loc) < WIDE_SWEEP_CLOSE_ENOUGH:
            # bounds are within the close-enough window; stop refining
            print "close enough!"  # //!
            found = True
            break
        prev_focus = focus
    print hits
    print nearest_guesses
    print (nearest_guesses[1]._seek_loc - nearest_guesses[0]._seek_loc,
           nearest_guesses[0]._seek_loc, nearest_guesses[1]._seek_loc)
    # copy the shared counters back into the per-process stats dict
    stats['seeks'] = arr[0]
    stats['reads'] = arr[1]
    return hits, nearest_guesses
def optimistic_edge_search(log, guess, times, results):
    """Reads a chunk and checks whole thing for timestamps. Better when it's really close.

    NOTE(review): this appears to be a duplicate of the earlier
    optimistic_edge_search definition in this file — confirm which copy is
    intended to win (the later def shadows the earlier one at import time).

    log     -- open log file; seeked and one EDGE_SWEEP_CHUNK_SIZE read is done here.
    guess   -- LogLocation near the desired edge; its minmax relation picks scan direction.
    times   -- pair of (min desired time, max desired time).
    results -- list; the best LogLocation found in the chunk is appended to it.
    """
    global stats  # module-level seek/read counters
    seek_loc = guess._seek_loc
    if guess.get_minmax() == LogLocation.OUT_OF_RANGE_HIGH:
        # we're looking for the max and we're above it, so read from a chunk away to here.
        # print "looking high"
        # print "%d %d" % (guess._seek_loc, guess._seek_loc - EDGE_SWEEP_CHUNK_SIZE)
        seek_loc -= EDGE_SWEEP_CHUNK_SIZE
        seek_loc = 0 if seek_loc < 0 else seek_loc  # clamp to start of file
        log.seek(seek_loc)
        stats['seeks'] += 1
    else:
        # we're looking for the min and we're below it, so read starting here.
        log.seek(seek_loc)
        stats['seeks'] += 1
    chunk = log.read(EDGE_SWEEP_CHUNK_SIZE)
    stats['reads'] += 1
    prev_minmax = guess.get_minmax()  # relation of the previous line examined
    result = LogLocation(0, datetime.min, LogLocation.TOO_LOW,
                         LogLocation.TOO_HIGH)  # an invalid result to start with
    chunk_loc = 0
    end_loc = chunk.rfind('\n')  # last complete line boundary in the chunk
    while chunk_loc < end_loc:
        # print "%d / %d" % (seek_loc + chunk_loc, seek_loc + end_loc)
        try:
            # find the nearest newline so we can find the timestamp
            nl_index = chunk[chunk_loc:].find('\n')
            if nl_index == -1:
                # print "can't find new line"
                break  # Can't find a newline; we're done.
            nl_index += 1  # get past the newline
            # find the first bit of the line, e.g. "Feb 14 05:52:12 web0"
            # split it on the whitespace, e.g. ["Feb", "14", "05:52:12", "web0"]
            # join the first three back together again with ' ' as the seperator
            # parse the thing!
            # //! need to research a better (faster?) way to do this
            time = parse_time(' '.join(
                chunk[chunk_loc + nl_index:chunk_loc + nl_index + 20].split()[:3]))  #//! magic 20
            # NOTE(review): chunk_loc also advances by nl_index in the finally
            # clause below, so the success path advances twice per iteration —
            # confirm this double-advance is intended and doesn't skip lines.
            chunk_loc += nl_index
            result._seek_loc = seek_loc + chunk_loc
            result._timestamp = time
            # compare to desired to see if it's a better max
            if time > times[1]:
                result._relation_to_desired_min = LogLocation.TOO_HIGH
                result._relation_to_desired_max = LogLocation.TOO_HIGH
                # check to see if it's the edge
                if prev_minmax[1] != LogLocation.TOO_HIGH:
                    # We passed out of range. This loc is where we want to /stop/ reading. Save it!
                    result.set_is_max(True)
                    # print "short circuit"
                break  # Can short-circuit if find a max, since we're reading buff beginning-to-end.
            elif time == times[1]:
                # do nothing for now about data, may optimize to save off data later.
                result._relation_to_desired_max = LogLocation.MATCH
            else:  # time < times[1]
                result._relation_to_desired_max = LogLocation.TOO_LOW
            # and now the min
            if time < times[0]:
                result._relation_to_desired_min = LogLocation.TOO_LOW
            elif time == times[0]:
                # do nothing for now about data, may optimize to save off data later.
                result._relation_to_desired_min = LogLocation.MATCH
            else:  # time > times[0]
                result._relation_to_desired_min = LogLocation.TOO_HIGH
            # print result.get_minmax()
            # see if we got the min edge (max was checked above)
            p = prev_minmax[0]
            r = result._relation_to_desired_min
            # print "p: %d, r: %d" % (p,r)
            if (prev_minmax == LogLocation.OUT_OF_RANGE_LOW) and (
                    result.get_minmax() == LogLocation.OUT_OF_RANGE_HIGH):
                pass  # //! No matches! Tell the dude and quit!
            elif (p == LogLocation.TOO_LOW) and (r != LogLocation.TOO_LOW):
                # We passed into our range via min. This is one.
                result.set_is_min(True)
                break
            prev_minmax = result.get_minmax()
        except ValueError:  # not a time string found
            print "time parse error"
            pass  # we're ok with occasional non-time string lines. Might start the read in the middle of a line, for example.
        finally:
            chunk_loc += nl_index  # advance past the newline just handled (runs on every path)
    # print result
    results.append(result)