# Reconstructed imports: the names below are assumptions inferred from usage.
# `svm`, `logbasic`, `log_h`, and `WebPage` are project-local; the expand_tree
# filters in this file receive node identifiers, which matches the older
# treelib releases this code was written against.
import datetime

from treelib import Tree

_WIDTH = Tree.WIDTH  # assumed alias for breadth-first traversal


def get_svm_pages(all_objects, valid_urls, predicted_file):
    (valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(all_objects,
                                                               valid_urls)
    # read predicted labels
    all_labels = [i.rstrip(' \r\n') for i in open(predicted_file, 'rb')]
    tp_pages = []
    fp_pages = []
    print len(all_pages), len(all_labels)
    assert len(all_pages) == len(all_labels)
    # split the predicted-positive pages into true and false positives
    for i in range(len(all_pages)):
        if all_labels[i] == '1':
            if all_pages[i].isvalid:
                tp_pages.append(all_pages[i])
            else:
                fp_pages.append(all_pages[i])
    pos_pages = tp_pages + fp_pages
    tp_roots = [i.root.identifier for i in tp_pages]
    fp_roots = [i.root.identifier for i in fp_pages]
    pos_roots = [i.root.identifier for i in pos_pages]

    # recut trees using predicted page candidates: each predicted root claims
    # its subtree up to, but not including, any other predicted root
    print 'Predicted pos:', len(pos_roots)
    recut_pos_pages = []
    for tree in valid_trees:
        local_pos_roots = [i for i in
                           tree.expand_tree(filter=lambda x: x in pos_roots)]
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            for node in tree.expand_tree(
                    root,
                    filter=lambda x: x == root or x not in local_pos_roots):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)
    recut_pos_pages.sort(key=lambda x: x.root.start_time)

    # attach each junk node to the latest page that started before it
    junk2 = len(junk_nodes)
    for node in junk_nodes:
        found_flag = False
        for page in recut_pos_pages[::-1]:
            if page.root.start_time < node.start_time:
                found_flag = True
                break
        if found_flag:
            page.junk_objs.append(node)
            junk2 -= 1

    recut_tp_pages = []
    recut_fp_pages = []
    for page in recut_pos_pages:
        if page.root.identifier in tp_roots:
            recut_tp_pages.append(page)
        elif page.root.identifier in fp_roots:
            recut_fp_pages.append(page)
    return recut_pos_pages, recut_tp_pages

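# The recut step above is easiest to see in isolation. Below is a hedged,
# self-contained sketch using only treelib: given a set of predicted page
# roots, each root claims its subtree up to (but excluding) any other
# predicted root. Note that recent treelib passes Node objects to the
# expand_tree filter, so this demo filters on n.identifier; the tree shape
# and ids are invented for illustration.
def _demo_recut():
    demo = Tree()
    demo.create_node('a', 'a')               # first predicted page root
    demo.create_node('b', 'b', parent='a')   # embedded object of page a
    demo.create_node('c', 'c', parent='b')   # second predicted page root
    demo.create_node('d', 'd', parent='c')   # embedded object of page c
    pos_roots = set(['a', 'c'])
    pages = {}
    for root in pos_roots:
        pages[root] = [n for n in demo.expand_tree(
            root,
            filter=lambda n: n.identifier == root or
            n.identifier not in pos_roots)]
    return pages  # {'a': ['a', 'b'], 'c': ['c', 'd']}
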
def process_tree(tree, k, t):
    # collect main-object candidates: root-type objects with HTTP status 200,
    # in breadth-first order (must be _WIDTH so that reversing the list below
    # processes the deepest candidates first)
    mocs = []
    for node in tree.expand_tree(mode=_WIDTH):
        if tree[node].is_root() and int(tree[node].status) == 200:
            mocs.append(node)
    valid = []
    for moc in mocs[::-1]:
        root = tree[moc]
        bp = tree[moc].bpointer
        if bp is None:
            # a candidate with no referrer is always a page root
            valid.append(moc)
        else:
            pred = tree[bp]
            # the subtree this candidate would claim, minus subtrees already
            # carved out by deeper valid candidates
            all_nodes = []
            for i in tree.expand_tree(
                    moc, filter=lambda x: x == moc or x not in valid):
                all_nodes.append(i)
            # accept only if the claimed subtree is large enough and the
            # candidate starts at least t seconds after its referrer
            if len(all_nodes) > k:
                if root.start_time - pred.start_time >= \
                        datetime.timedelta(seconds=t):
                    valid.append(moc)

    # parse pages: shallowest valid roots first, each claiming what remains
    # of its subtree
    pages = []
    for rootid in valid[::-1]:
        new_page = WebPage()
        new_page.add_obj(tree[rootid], True)
        pages.append(new_page)
        for nodeid in tree.expand_tree(
                rootid, filter=lambda x: x == rootid or x not in valid):
            new_page.add_obj(tree[nodeid])
    return pages

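# Why the candidate scan above insists on breadth-first order: WIDTH yields
# shallower nodes before deeper ones, so iterating mocs[::-1] handles the
# deepest candidates first and parent pages only keep what is left over.
# A small standalone check (tree shape invented; assumes current treelib,
# where the traversal constants live on Tree):
def _demo_bfs_order():
    demo = Tree()
    demo.create_node('r', 'r')
    demo.create_node('x', 'x', parent='r')
    demo.create_node('y', 'y', parent='r')
    demo.create_node('z', 'z', parent='x')
    return list(demo.expand_tree(mode=Tree.WIDTH))  # ['r', 'x', 'y', 'z']
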
def get_timetype_pages(all_objects, valid_pages):
    t = 1.2  # gap threshold in seconds
    all_objects.sort(key=lambda x: x.start_time)
    all_pages = []
    last_page = None
    last_node = None
    for node in all_objects:
        if last_page is None:
            # objects seen before the first root cannot be attached to any
            # page, so skip them until a root object arrives
            if node.is_root():
                new_page = WebPage()
                all_pages.append(new_page)
                new_page.add_obj(node, root=True)
                last_page = new_page
        elif node.is_root() and \
                node.start_time - last_node.start_time >= \
                datetime.timedelta(seconds=t):
            # a root object arriving after a long enough gap starts a new page
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            last_page.add_obj(node)
        last_node = node
    tp_pages = []
    fp_pages = []
    for page in all_pages:
        if page.root.identifier in valid_pages:
            tp_pages.append(page)
        else:
            fp_pages.append(page)
    return all_pages, tp_pages

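# The time cut above reduces to one rule: a gap of at least t seconds opens a
# new page. A self-contained sketch of just the gap test on bare timestamps,
# ignoring the is_root condition for brevity (timestamp values are invented):
def _demo_time_gaps():
    base = datetime.datetime(2015, 1, 1)
    starts = [base + datetime.timedelta(seconds=s)
              for s in (0.0, 0.3, 2.0, 2.4)]
    t = 1.2
    groups = [[starts[0]]]
    for prev, cur in zip(starts, starts[1:]):
        if cur - prev >= datetime.timedelta(seconds=t):
            groups.append([cur])    # gap >= t: start a new page
        else:
            groups[-1].append(cur)  # otherwise attach to the current page
    return len(groups)  # 2: the 1.7 s gap between 0.3 and 2.0 splits the run
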
def get_type_pages(all_nodes, valid_pages):
    all_nodes.sort(key=lambda x: x.start_time)
    all_pages = []
    last_page = None
    for node in all_nodes:
        if node.is_root():
            # every root-type object starts a new page
            new_page = WebPage()
            new_page.add_obj(node, root=True)
            all_pages.append(new_page)
            last_page = new_page
        elif last_page is not None:
            # embedded objects attach to the most recent page
            last_page.add_obj(node)
    tp_pages = []
    fp_pages = []
    for page in all_pages:
        if page.root.identifier in valid_pages:
            tp_pages.append(page)
        else:
            fp_pages.append(page)
    return all_pages, tp_pages

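# get_type_pages is the simplest cut: root objects open pages, everything
# else joins the latest page. A standalone sketch with plain dicts standing
# in for the project's node objects (an assumption made for illustration):
def _demo_type_cut():
    objs = [{'url': '/a', 'root': True},
            {'url': '/a.css', 'root': False},
            {'url': '/b', 'root': True},
            {'url': '/b.js', 'root': False}]
    pages = []
    for o in objs:
        if o['root']:
            pages.append([o])    # a root object starts a new page
        elif pages:
            pages[-1].append(o)  # other objects join the latest page
    return pages  # two pages with two objects each
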
# script fragment: this_log, input_file, and detected_pageurl are set earlier
print 'log file: %s' % this_log
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)

# type-based cut: every root object opens a new page
all_pages = []
last_page = None
for node in all_nodes:
    if node.is_root():
        new_page = WebPage()
        new_page.add_obj(node, root=True)
        all_pages.append(new_page)
        last_page = new_page
    elif last_page is not None:
        last_page.add_obj(node)

print len(all_nodes)
print len(all_pages)

# write the detected page URLs, one per line
all_urls = [i.root.url for i in all_pages]
ofile = open(detected_pageurl, 'wb')
ofile.write('\n'.join(all_urls))
ofile.close()

# script fragment: all_lines, all_nodes, and log_h are set earlier
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)
print len(all_nodes)

# sweep the gap threshold t from 0.2 s to 20.0 s in 0.2 s steps
T = [i / 10.0 for i in range(2, 202, 2)]
for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        if last_page is None:
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        elif node.start_time - last_node.start_time >= \
                datetime.timedelta(seconds=t):
            # a gap of at least t seconds starts a new page
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            last_page.add_obj(node)
        last_node = node
    print 'Page count: %d' % len(all_pages)