import logbasic as basic  # project-local log parser; exact module path assumed


###### preprocess log
def process_log(logfile):
    """Parse an HTTP log file and return its nodes sorted by request start time."""
    print 'Processing HTTP logs...'
    all_lines = basic.read(logfile)
    all_nodes = [basic.NodeFromLog(line) for line in all_lines]
    # The original cmp/key/reverse triple reduces to a plain key sort.
    all_nodes.sort(key=lambda x: x.start_time)
    return all_nodes
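###### reference: assumed log-parser interface
# Every script in this repo leans on the same two helpers, read() and
# NodeFromLog(), plus node objects exposing start_time and is_root(). Their
# real implementation is not part of this extract; the sketch below is a
# hypothetical stand-in showing only the interface the callers assume (the
# field layout is a guess), not the project's actual code.


class Node(object):
    def __init__(self, start_time, url, referrer=None):
        self.start_time = start_time  # request start timestamp in seconds
        self.url = url
        self.referrer = referrer

    def is_root(self):
        # Assumption: a node roots a page when it has no referrer.
        return self.referrer is None


def read(logfile):
    # One request/response pair per line.
    with open(logfile) as f:
        return f.readlines()


def NodeFromLog(line):
    # Parse one log line into a Node; the field layout here is illustrative.
    fields = line.split()
    return Node(float(fields[0]), fields[1],
                fields[2] if len(fields) > 2 else None)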
import argparse
import sys

import logbasic  # project-local log parser; module path assumed
from lib.myWeb import WebPage, WebObject
from lib.utilities import Logger

parser = argparse.ArgumentParser(
    description='Page reconstruction from weblog using a type-based approach.')
parser.add_argument('logfile', type=str,
                    help='log file containing the request/response pairs')
args = parser.parse_args()

input_file = args.logfile
detected_pageurl = input_file + '.page.tmp'
print 'detected pages: %s' % detected_pageurl

###### logging
this_log = './log/' + sys.argv[0].replace('.', '_') + '.log'
log_h = Logger(this_log)
print 'log file: %s' % this_log

print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
all_nodes = [logbasic.NodeFromLog(line) for line in all_lines]
# Sort by request start time (the original cmp/key/reverse triple
# reduces to a plain key sort).
all_nodes.sort(key=lambda x: x.start_time)
print 'nodes: %d' % len(all_nodes)

# Sweep the time threshold t from 0.2 s to 20.0 s in 0.2 s steps.
T = [i / 10.0 for i in range(2, 202, 2)]
for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
    for node in all_nodes:
        if node.is_root():
            # A root node (typically a main HTML document) starts a new page.
            new_page = WebPage()
            new_page.add_obj(node, root=True)
            all_pages.append(new_page)
            last_page = new_page
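        # NOTE: the extract is truncated here. The non-root branch below is a
        # hypothetical sketch of the idle-time test that the t-sweep implies
        # (attach embedded objects to the most recent page while the gap stays
        # under t seconds); last_time() is an assumed WebPage helper, and none
        # of this is the author's code.
        else:
            if last_page is not None and \
                    node.start_time - last_page.last_time() <= t:
                last_page.add_obj(node)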
import argparse
from subprocess import Popen, PIPE

import logbasic as basic  # project-local log parser; module path assumed
from lib.myWeb import Graph  # referrer-tree builder; import path assumed
from lib.utilities import log  # module-level logger; import path assumed


def main():
    parser = argparse.ArgumentParser(
        description='Page reconstruction from weblog using the StreamStructure '
                    'algorithm proposed by S. Ihm at IMC 2011.')
    parser.add_argument('-k', type=int, default=2, help='K parameter')
    parser.add_argument('-t', type=int, default=5, help='T parameter')
    parser.add_argument('logfile', type=str,
                        help='log file containing the request/response pairs')
    args = parser.parse_args()

    log_file = args.logfile
    detected_pageurl = log_file + '.page.tmp'
    K = [args.k]
    T = [args.t]

    print 'Reading log...'
    all_lines = basic.read(log_file)

    print 'Processing rrp...'
    all_nodes = [basic.NodeFromLog(line) for line in all_lines]
    # Sort by request start time (the original cmp/key/reverse triple
    # reduces to a plain key sort).
    all_nodes.sort(key=lambda x: x.start_time)

    ###### construct trees
    print 'Creating graph...'
    new_graph = Graph()
    for node in all_nodes:
        new_graph.add_node(node)
    trees = new_graph.all_trees()
    junk_nodes = new_graph.junk_nodes

    # Little trick: treat a tree with a single node as invalid and add its
    # nodes to 'junk_nodes'.
    valid_trees = []
    for tree in trees:
        if len(tree.nodes) > 1:
            valid_trees.append(tree)
        else:
            junk_nodes += tree.nodes
    print('valid trees: {0}, junk_nodes: {1}'.format(
        len(valid_trees), len(junk_nodes)))

    ###### cut pages
    # Note: this sweep overrides the -k/-t values parsed above.
    K = [1]
    T = [i / 10.0 for i in range(2, 200, 2)]
    for k in K:
        for t in T:
            log('#############')
            log('K = %d, T = %.2f' % (k, t))
            all_pages = []
            for tree in valid_trees:
                all_pages += process_tree(tree, k, t)
            log('Pages: %d' % len(all_pages))

            # Dump the detected page URLs and score them against ground truth.
            all_urls = [i.root.url for i in all_pages]
            ofile = open(detected_pageurl, 'wb')
            ofile.write('\n'.join(all_urls))
            ofile.close()

            page_gt = log_file.split('.')[0] + '.page'
            cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(
                detected_pageurl, page_gt)
            f = Popen(cmd, shell=True, stdout=PIPE).stdout
            for line in f:
                log(line.strip(" \r\n"))
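# process_tree() is called above but is not part of this extract. The sketch
# below is a hypothetical placeholder showing one plausible idle-time cut
# (start a new page whenever the gap since the previous request exceeds t);
# the real StreamStructure step also uses the K parameter and content types,
# so treat this only as an illustration of the expected signature.
def process_tree(tree, k, t):
    pages = []
    current = None
    prev_time = None
    for node in sorted(tree.nodes, key=lambda n: n.start_time):
        if current is None or node.start_time - prev_time > t:
            current = WebPage()  # assumes 'from lib.myWeb import WebPage'
            current.add_obj(node, root=True)
            pages.append(current)
        else:
            current.add_obj(node)
        prev_time = node.start_time
    return pages


if __name__ == '__main__':
    main()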