def add_node(self, new_node):
    """Create the referrer trees depending on nodes' referrer relationships."""
    # Search for the corresponding subgraph
    if new_node.user_ip is None:
        print 'Source IP is lost in request/response pair.'
        exit(-1)
    subgraph = self.get_subgraph(new_node.user_ip)
    if subgraph is None:
        subgraph = SubGraph(new_node.user_ip)
        self.subgraphs.append(subgraph)
    try:
        # Start linking
        if new_node.user_agent is not None:
            if new_node.user_agent in subgraph.ua_trees_d:
                linked_flag = False
                # Check the newest trees of this user agent first
                for tree in subgraph.ua_trees_d[new_node.user_agent][::-1]:
                    # Session idle time of 15 minutes used
                    if new_node.start_time - tree.nodes[-1].start_time <= datetime.timedelta(minutes=15):
                        # Find its predecessor
                        pred_id = None
                        if new_node.referrer:
                            for item in tree.nodes[::-1]:
                                #if utilities.cmp_url(new_node.referrer, item.url, 'loose'):
                                if utilities.cmp_url(new_node.referrer, item.url, 'strict'):
                                    pred_id = item.identifier
                                    break
                        if pred_id is not None:
                            # Predecessor found: attach the node to this tree
                            tree.add_node(new_node, pred_id)
                            linked_flag = True
                            break
                # After all the trees are checked:
                if not linked_flag:
                    raise NewTreeNeeded
            else:
                # New user agent index and new tree
                raise NewTreeNeeded
    except NewTreeNeeded:
        if new_node.is_root():
            if int(new_node.status) == 200:
                new_tree = mod_tree.Tree()
                new_tree.add_node(new_node, parent=None)
                # Update the graph
                try:
                    subgraph.ua_trees_d[new_node.user_agent].append(new_tree)
                except KeyError:
                    subgraph.ua_trees_d[new_node.user_agent] = [new_tree]
        else:
            self.junk_nodes.append(new_node)
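# The linking logic above relies on a module-level 'import datetime' and on
# the NewTreeNeeded exception to signal that no existing tree accepted the
# node.  The exception's definition is not part of this excerpt; a minimal
# sketch of what add_node() assumes looks like this (the class name comes
# from the code above, the empty body is an assumption):
class NewTreeNeeded(Exception):
    """Raised when a node cannot be attached to any existing referrer tree
    and a new tree (a new page candidate) has to be started instead."""
    pass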
import os
import json
import codecs

import utilities
import mod_tree


def parse_pages_har(harfolder):
    print 'Processing har files...'
    # Process all HAR files under the folder
    all_real_pages = []
    all_objects = []
    for root, dirs, files in os.walk(harfolder):
        for file in files:
            # [-1] avoids an IndexError for filenames without a dot
            suffix = file.rsplit('.', 1)[-1]
            if suffix != 'har':
                continue
            inputfile = os.path.join(root, file)
            # Open HAR file
            har_log = json.load(codecs.open(inputfile, 'rb', 'utf-8'))['log']
            har_pages = har_log['pages']
            har_objects = har_log['entries']

            # Extract web objects and order them in time
            allnodes = []
            for i in har_objects:
                new_node = NodeFromHar(i)  # new node
                allnodes.append(new_node)
            allnodes.sort(key=lambda x: x.start_time)
            all_objects += allnodes

            # Find valid trees from raw web objects
            trees = []
            junk_nodes = []  # nodes that can't find a referrer and are not roots
            tot = 0
            for new_node in allnodes:
                tot += 1
                try:
                    # Start linking
                    linked_flag = False
                    for tree in trees:
                        pred_id = None
                        if new_node.referrer:
                            for item in tree.nodes[::-1]:
                                if utilities.cmp_url(new_node.referrer, item.url, 'strict'):
                                    pred_id = item.identifier
                                    break
                        if pred_id is not None:
                            # Predecessor found: attach the node to this tree
                            tree.add_node(new_node, pred_id)
                            linked_flag = True
                            break
                    # After all the trees are checked:
                    if not linked_flag:
                        raise NewTreeNeeded
                except NewTreeNeeded:
                    if new_node.is_root():
                        if new_node.status == 200:
                            new_tree = mod_tree.Tree()  # new tree
                            new_tree.add_node(new_node, None)
                            linked_flag = True
                            trees.append(new_tree)
                    else:
                        junk_nodes.append(new_node)

            # Sort trees in ascending order of their root's start time
            trees.sort(key=lambda x: x[x.root].start_time)

            # Little trick: treat a tree with only one node as invalid
            # and add its nodes to 'junk_nodes'
            valid_trees = []
            for tree in trees:
                if len(tree.nodes) > 1:
                    valid_trees.append(tree)
                else:
                    junk_nodes += tree.nodes
            #log('{0} {1} {2}'.format(tot, len(junk_nodes), input))

            # Find real page(s) from valid trees: one tree -> one page
            real_pages = []
            last = None
            for tree in valid_trees:
                new_page = WebPage()
                new_page.root = tree[tree.root]
                new_page.objs = tree.nodes
                real_pages.append(new_page)
                last = tree

            # Optional: process junk web objects by adding each one to the
            # nearest earlier web page in 'real_pages'
            junk2 = 0
            for node in junk_nodes:
                found_flag = False
                for page in real_pages[::-1]:
                    if page.root.start_time < node.start_time:
                        found_flag = True
                        break
                if found_flag:
                    page.objs.append(node)
                else:
                    junk2 += 1

            # Little trick: with foreknowledge, the first page is the real page,
            # so we keep the first one and drop the others as invalid.
            all_real_pages += real_pages[0:1]
    return all_real_pages, all_objects
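# A minimal usage sketch (the folder path is a placeholder; parse_pages_har()
# and the WebPage attributes 'root' and 'objs' come from the code above):
if __name__ == '__main__':
    pages, objects = parse_pages_har('/path/to/har/folder')
    print '{0} real pages, {1} web objects in total'.format(len(pages), len(objects))
    for page in pages:
        # Each WebPage keeps its root request and every object linked to it.
        print page.root.url, len(page.objs)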