예제 #1
0
def hosts_handler(page=1, page_size=10, current_host=None, which_collection="crawl-data", filter_field=None, filter_regex=None, show_all=None):
    """Put together host documents for use with hosts endpoint.

    Args:
        page: Page number forwarded to ``MemexMongoUtils.list_hosts``.
        page_size: Number of hosts per page. Without ``current_host`` it is
            forwarded to ``list_hosts``; with ``current_host`` it is used to
            locate the page boundary at which the result is cut off, and
            must then be non-zero because it is used as a modulus.
        current_host: Optional host name. When truthy, up to 1000 hosts are
            requested and the list is truncated at the end of the
            ``page_size``-sized page that contains this host. If the host is
            not found, or no page boundary follows it, nothing is truncated.
        which_collection: Collection name handed to ``MemexMongoUtils``.
        filter_field: Forwarded unchanged to ``list_hosts``.
        filter_regex: Forwarded unchanged to ``list_hosts``.
        show_all: Forwarded unchanged to ``list_hosts``.

    Returns:
        The list returned by ``list_hosts``; each host dict has its ``_id``
        key removed and gains ``is_known_host`` and ``hsu_screenshot_path``
        (``None`` when no screenshot URL is available).
    """

    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        # Fetch one large batch so the page holding current_host can be
        # located regardless of the page size the caller asked for.
        current_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page, page_size=current_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

        # Keep everything up to the first page boundary at or after the
        # position of current_host; keep the whole batch if none is reached.
        keep = len(host_dics)
        matched = False
        for count, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            if matched and count % page_size == 0:
                keep = count
                break

        # Drop the leftovers in place (assumes list_hosts returns a list,
        # as the original len()/pop() usage already required).
        del host_dics[keep:]
    else:
        host_dics = mmu.list_hosts(page=page, page_size=page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for host_dic in host_dics:
        # Known-hostedness is added here. The default avoids a KeyError on
        # documents that carry no "_id" key.
        host_dic.pop("_id", None)
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])

        # NOTE: host_score is not populated in this version of the handler.
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])
        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics
예제 #2
0
def get_page_number_for_host(path,
                             page_size,
                             current_host,
                             filter_field=None,
                             filter_regex=None,
                             show_all=None):
    """Return the index of the page on which ``current_host`` is listed.

    The collection is resolved from ``path`` via ``get_collection_by_path``
    and the first 10,000 hosts are fetched in a single ``list_hosts`` call;
    hosts beyond that limit are not searched.

    Args:
        path: Value passed to ``get_collection_by_path`` to pick the
            collection.
        page_size: Number of hosts per page; must be non-zero.
        current_host: Host name to look for (compared to each ``"host"``
            value with ``==``).
        filter_field: Forwarded unchanged to ``list_hosts``.
        filter_regex: Forwarded unchanged to ``list_hosts``.
        show_all: Forwarded unchanged to ``list_hosts``.

    Returns:
        ``position // page_size`` for the first matching host, where
        ``position`` counts from 0 (so a host on the first page yields 0),
        or 0 when the host is not found.
        NOTE(review): callers elsewhere default ``page`` to 1 -- confirm
        whether they expect this zero-based value.
    """

    which_collection = get_collection_by_path(path)

    mmu = MemexMongoUtils(which_collection=which_collection)

    max_page_size = 100 * 100  # max results per page
    host_dics = mmu.list_hosts(page=1,
                               page_size=max_page_size,
                               filter_field=filter_field,
                               filter_regex=filter_regex,
                               show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page number an integer on Python 3
            # as well as Python 2.
            return position // page_size

    return 0
예제 #3
0
def hosts_handler(page = 1, which_collection = "crawl-data", filter_field = None, filter_regex = None):
    """Build the host documents served by the hosts endpoint.

    Fetches one page of hosts from ``MemexMongoUtils.list_hosts`` (passing
    ``page``, ``filter_field`` and ``filter_regex`` through) and decorates
    every host dict in place: ``_id`` is removed, and ``is_known_host``,
    ``host_score`` and ``hsu_screenshot_path`` are added. The screenshot
    path is ``None`` when no highest-scoring URL with a screenshot exists.

    Returns the decorated list of host dicts.
    """

    mongo = MemexMongoUtils(which_collection = which_collection)
    known_hosts = KnownHostsCompare()

    hosts = mongo.list_hosts(page = page, filter_field = filter_field, filter_regex = filter_regex)

    for entry in hosts:
        entry.pop("_id")
        hostname = entry["host"]

        # Known-hostedness first, then the score, then the screenshot path,
        # so the keys are inserted in the same order as before.
        entry["is_known_host"] = known_hosts.is_known_host(hostname)
        best_url = mongo.get_highest_scoring_url_with_screenshot(hostname)
        entry["host_score"] = mongo.get_host_score(hostname)
        entry["hsu_screenshot_path"] = (
            get_screenshot_relative_path(best_url['screenshot_path'])
            if best_url else None
        )

    return hosts
예제 #4
0
def get_page_number_for_host(path, page_size, current_host, filter_field=None, filter_regex=None, show_all=None):
    """Return the index of the page on which ``current_host`` is listed.

    The collection is resolved from ``path`` via ``get_collection_by_path``
    and the first 10,000 hosts are fetched in a single ``list_hosts`` call;
    hosts beyond that limit are not searched.

    Args:
        path: Value passed to ``get_collection_by_path`` to pick the
            collection.
        page_size: Number of hosts per page; must be non-zero.
        current_host: Host name to look for (compared to each ``"host"``
            value with ``==``).
        filter_field: Forwarded unchanged to ``list_hosts``.
        filter_regex: Forwarded unchanged to ``list_hosts``.
        show_all: Forwarded unchanged to ``list_hosts``.

    Returns:
        ``position // page_size`` for the first matching host, where
        ``position`` counts from 0 (so a host on the first page yields 0),
        or 0 when the host is not found.
        NOTE(review): callers elsewhere default ``page`` to 1 -- confirm
        whether they expect this zero-based value.
    """

    which_collection = get_collection_by_path(path)

    mmu = MemexMongoUtils(which_collection=which_collection)

    max_page_size = 100 * 100  # max results per page
    host_dics = mmu.list_hosts(page=1, page_size=max_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page number an integer on Python 3
            # as well as Python 2.
            return position // page_size

    return 0
예제 #5
0
def hosts_handler(page=1,
                  page_size=10,
                  current_host=None,
                  which_collection="crawl-data",
                  filter_field=None,
                  filter_regex=None,
                  show_all=None):
    """Put together host documents for use with hosts endpoint.

    Args:
        page: Page number forwarded to ``MemexMongoUtils.list_hosts``.
        page_size: Number of hosts per page. Without ``current_host`` it is
            forwarded to ``list_hosts``; with ``current_host`` it is used to
            locate the page boundary at which the result is cut off, and
            must then be non-zero because it is used as a modulus.
        current_host: Optional host name. When truthy, up to 1000 hosts are
            requested and the list is truncated at the end of the
            ``page_size``-sized page that contains this host. If the host is
            not found, or no page boundary follows it, nothing is truncated.
        which_collection: Collection name handed to ``MemexMongoUtils``.
        filter_field: Forwarded unchanged to ``list_hosts``.
        filter_regex: Forwarded unchanged to ``list_hosts``.
        show_all: Forwarded unchanged to ``list_hosts``.

    Returns:
        The list returned by ``list_hosts``; each host dict has its ``_id``
        key removed and gains ``is_known_host`` and ``hsu_screenshot_path``
        (``None`` when no screenshot URL is available).
    """

    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        # Fetch one large batch so the page holding current_host can be
        # located regardless of the page size the caller asked for.
        current_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page,
                                   page_size=current_page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

        # Keep everything up to the first page boundary at or after the
        # position of current_host; keep the whole batch if none is reached.
        keep = len(host_dics)
        matched = False
        for count, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            if matched and count % page_size == 0:
                keep = count
                break

        # Drop the leftovers in place (assumes list_hosts returns a list,
        # as the original len()/pop() usage already required).
        del host_dics[keep:]
    else:
        host_dics = mmu.list_hosts(page=page,
                                   page_size=page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

    for host_dic in host_dics:
        # Known-hostedness is added here. The default avoids a KeyError on
        # documents that carry no "_id" key.
        host_dic.pop("_id", None)
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])

        # NOTE: host_score is not populated in this version of the handler.
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])
        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(
                hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics