Пример #1
0
def get_timetype_pages(all_objects, valid_pages, t=1.2):
    """Group objects into pages using root nodes and a time-gap threshold.

    ``all_objects`` is sorted in place by ``start_time``.  The first root
    node opens a page; afterwards a root node opens a new page only when it
    starts at least ``t`` seconds after the object immediately before it.
    Every other object is attached to the most recently opened page.

    Args:
        all_objects: Mutable list of objects exposing ``start_time`` and
            ``is_root()``.  The list is reordered by this call.
        valid_pages: Container of root identifiers; pages whose
            ``root.identifier`` is found in it are reported separately.
        t: Minimum gap in seconds between a root node and the preceding
            object for the root to start a new page.  Defaults to 1.2, the
            value that used to be hard-coded.

    Returns:
        A tuple ``(all_pages, tp_pages)`` where ``tp_pages`` is the subset
        of ``all_pages`` whose root identifier is in ``valid_pages``.
    """
    # A key-only sort orders exactly like the former cmp+key call, and also
    # works on interpreters where list.sort() no longer accepts a comparator.
    all_objects.sort(key=lambda obj: obj.start_time)
    min_gap = datetime.timedelta(seconds=t)

    all_pages = []
    current_page = None
    previous = None
    for node in all_objects:
        if node.is_root() and (
                current_page is None or
                node.start_time - previous.start_time >= min_gap):
            current_page = WebPage()
            all_pages.append(current_page)
            current_page.add_obj(node, root=True)
        elif current_page is not None:
            current_page.add_obj(node)
        # else: the object precedes the first root, so there is no page to
        # attach it to yet.  It is skipped (as get_type_pages does) instead
        # of failing on ``None.add_obj``.
        previous = node

    tp_pages = [
        page for page in all_pages if page.root.identifier in valid_pages
    ]
    return all_pages, tp_pages
Пример #2
0
def process_tree(tree, k, t):
    """Split ``tree`` into pages rooted at selected candidate nodes.

    Candidates are the nodes, visited in ``_WIDTH`` order, whose
    ``is_root()`` is true and whose ``status`` converts to the integer 200.
    They are examined in reverse visiting order.  A candidate without a
    parent (``bpointer`` is ``None``) is always accepted.  Any other
    candidate is accepted when its subtree, pruned at already accepted
    candidates, has more than ``k`` nodes and it starts at least ``t``
    seconds after its parent.

    Args:
        tree: Tree exposing ``expand_tree``, item access by node id, and
            nodes with ``is_root()``, ``status``, ``bpointer`` and
            ``start_time``.
        k: Subtree size that a non-top candidate must exceed.
        t: Minimum gap in seconds between a candidate and its parent.

    Returns:
        A list of ``WebPage`` objects, one per accepted candidate, in the
        reverse of the order in which candidates were accepted.
    """
    # The traversal mode must stay _WIDTH: the acceptance pass below walks
    # the candidates in the reverse of this order.
    candidates = [
        nid for nid in tree.expand_tree(mode=_WIDTH)
        if tree[nid].is_root() and int(tree[nid].status) == 200
    ]

    valid = []
    for moc in reversed(candidates):
        root = tree[moc]
        parent_id = tree[moc].bpointer
        if parent_id is None:
            valid.append(moc)
            continue

        pred = tree[parent_id]
        # Subtree of this candidate, not descending into candidates that
        # have already been accepted.
        subtree = list(tree.expand_tree(
            moc, filter=lambda x: x == moc or x not in valid))
        if len(subtree) <= k:
            continue
        if root.start_time - pred.start_time >= datetime.timedelta(seconds=t):
            valid.append(moc)

    # Build one page per accepted candidate from its pruned subtree.
    pages = []
    for rootid in reversed(valid):
        page = WebPage()
        page.add_obj(tree[rootid], True)
        pages.append(page)
        members = tree.expand_tree(
            rootid, filter=lambda x: x == rootid or x not in valid)
        for nodeid in members:
            page.add_obj(tree[nodeid])

    return pages
Пример #3
0
def get_type_pages(all_nodes, valid_pages):
    """Group objects into pages, opening a new page at every root node.

    ``all_nodes`` is sorted in place by ``start_time``.  Each root node
    starts a new page; each non-root node is attached to the most recently
    opened page.  Non-root nodes that precede the first root are skipped.

    Args:
        all_nodes: Mutable list of objects exposing ``start_time`` and
            ``is_root()``.  The list is reordered by this call.
        valid_pages: Container of root identifiers; pages whose
            ``root.identifier`` is found in it are reported separately.

    Returns:
        A tuple ``(all_pages, tp_pages)`` where ``tp_pages`` is the subset
        of ``all_pages`` whose root identifier is in ``valid_pages``.
    """
    # A key-only sort orders exactly like the former cmp+key call, and also
    # works on interpreters where list.sort() no longer accepts a comparator.
    all_nodes.sort(key=lambda node: node.start_time)

    all_pages = []
    current_page = None
    for node in all_nodes:
        if node.is_root():
            current_page = WebPage()
            current_page.add_obj(node, root=True)
            all_pages.append(current_page)
        elif current_page is not None:
            current_page.add_obj(node)

    tp_pages = [
        page for page in all_pages if page.root.identifier in valid_pages
    ]
    return all_pages, tp_pages
Пример #4
0
def get_svm_pages(all_objects, valid_urls, predicted_file):
    """Rebuild pages from the candidates that the label file marks positive.

    ``svm.parse_pages_svm`` supplies the trees, the candidate pages and the
    junk nodes.  ``predicted_file`` must contain one label per candidate
    page, in the same order; a candidate is positive when its label is the
    string ``'1'``.  Each tree is then re-cut at the positive roots, the
    resulting pages are ordered by root start time, and every junk node is
    appended to the ``junk_objs`` of the latest page that starts strictly
    before it.

    Args:
        all_objects: Passed through to ``svm.parse_pages_svm``.
        valid_urls: Passed through to ``svm.parse_pages_svm``.
        predicted_file: Path of the text file holding one label per line.

    Returns:
        A tuple ``(recut_pos_pages, recut_tp_pages)``: all re-cut positive
        pages, and those whose root belongs to a positive candidate with a
        true ``isvalid`` attribute.

    Raises:
        AssertionError: If the number of labels differs from the number of
            candidate pages.
    """
    (valid_trees, all_pages,
     junk_nodes) = svm.parse_pages_svm(all_objects, valid_urls)

    # Read the predicted labels in text mode so they compare equal to '1',
    # and close the file as soon as it has been consumed.
    with open(predicted_file, 'r') as label_file:
        all_labels = [line.rstrip(' \r\n') for line in label_file]

    print('%d %d' % (len(all_pages), len(all_labels)))
    assert len(all_pages) == len(all_labels), (
        'expected one label per page: %d pages, %d labels' %
        (len(all_pages), len(all_labels)))

    # Root identifiers of the positive candidates, kept in sets because they
    # are probed once per tree node below.
    # NOTE(review): assumes identifiers are hashable; they are already used
    # to index the trees -- confirm.
    pos_roots = set()
    tp_roots = set()
    positive_count = 0
    for page, label in zip(all_pages, all_labels):
        if label != '1':
            continue
        positive_count += 1
        pos_roots.add(page.root.identifier)
        if page.isvalid:
            tp_roots.add(page.root.identifier)

    # Recut trees using the predicted page candidates.
    print('Predicted pos: %d' % positive_count)
    recut_pos_pages = []
    for tree in valid_trees:
        local_pos_roots = list(
            tree.expand_tree(filter=lambda x: x in pos_roots))
        local_root_set = set(local_pos_roots)
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            # The filter closes over ``root``; that is safe because the
            # expansion is consumed before ``root`` is rebound.
            for node in tree.expand_tree(
                    root,
                    filter=lambda x: x == root or x not in local_root_set):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)

    recut_pos_pages.sort(key=lambda page: page.root.start_time)

    # Attach each junk node to the latest page that started before it;
    # junk nodes that precede every page are left unattached.
    for node in junk_nodes:
        for page in reversed(recut_pos_pages):
            if page.root.start_time < node.start_time:
                page.junk_objs.append(node)
                break

    recut_tp_pages = [
        page for page in recut_pos_pages if page.root.identifier in tp_roots
    ]
    return recut_pos_pages, recut_tp_pages
Пример #5
0
# Turn every log line into a node, then order the nodes chronologically.
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda node: node.start_time)

print(len(all_nodes))

# Gap thresholds to try, in seconds: 0.2, 0.4, ..., 20.0.
T = [i / 10.0 for i in range(2, 202, 2)]

for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        # A page starts at the very first node and whenever the gap to the
        # previous node reaches the threshold; the ``is None`` test runs
        # first, so ``last_node`` is never dereferenced while unset.
        if (last_page is None or
                node.start_time - last_node.start_time >=
                datetime.timedelta(seconds=t)):
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            last_page.add_obj(node)
        last_node = node

    print('Page count: %d' % len(all_pages))