示例#1
0
def get_page_number_for_host(path,
                             page_size,
                             current_host,
                             filter_field=None,
                             filter_regex=None,
                             show_all=None):
    """Return the zero-based page index on which ``current_host`` is listed.

    The collection is resolved from ``path`` via ``get_collection_by_path``.
    Hosts are then requested in one large page, with the filter arguments
    passed through to ``list_hosts``, and scanned in the order returned.

    Args:
        path: Value handed to ``get_collection_by_path``.
        page_size: Hosts per page; a zero value raises ``ZeroDivisionError``
            once a matching host is found.
        current_host: Compared against each document's ``"host"`` value.
        filter_field: Passed through to ``list_hosts``.
        filter_regex: Passed through to ``list_hosts``.
        show_all: Passed through to ``list_hosts``.

    Returns:
        ``position // page_size`` for the first matching host, or 0 when no
        host in the fetched list matches.
    """
    which_collection = get_collection_by_path(path)
    mmu = MemexMongoUtils(which_collection=which_collection)

    # NOTE(review): presumably list_hosts caps its result at page_size, so
    # hosts beyond this bound would never be found -- confirm.
    max_page_size = 100 * 100
    host_dics = mmu.list_hosts(page=1,
                               page_size=max_page_size,
                               filter_field=filter_field,
                               filter_regex=filter_regex,
                               show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page index an integer even where "/"
            # performs true division.
            return position // page_size

    return 0
示例#2
0
def hosts_handler(page=1, which_collection="crawl-data", filter_field=None, filter_regex=None):
    """Build the list of host documents for one page of hosts.

    Every document returned by ``list_hosts`` has its ``"_id"`` removed and
    gains three keys: ``"is_known_host"``, ``"host_score"`` and
    ``"hsu_screenshot_path"`` (``None`` when no highest-scoring URL with a
    screenshot is reported for the host).
    """
    mongo = MemexMongoUtils(which_collection=which_collection)
    known_hosts = KnownHostsCompare()

    documents = mongo.list_hosts(page=page,
                                 filter_field=filter_field,
                                 filter_regex=filter_regex)

    for document in documents:
        document.pop("_id")
        host = document["host"]

        # Known-host flag and host score are attached here.
        document["is_known_host"] = known_hosts.is_known_host(host)
        best_url = mongo.get_highest_scoring_url_with_screenshot(host)
        document["host_score"] = mongo.get_host_score(host)

        document["hsu_screenshot_path"] = (
            get_screenshot_relative_path(best_url['screenshot_path'])
            if best_url else None
        )

    return documents
示例#3
0
def get_job_state_handler(url, spider_host="localhost", spider_port="6800"):
    """Return what ``ScrapydJob.get_state`` reports for the job of seed ``url``.

    The job id is read from the ``"job_id"`` key of the seed document that
    ``MemexMongoUtils.get_seed_doc`` returns for ``url``.
    """
    mongo = MemexMongoUtils()
    scrapyd = ScrapydJob(spider_host, spider_port)
    seed_doc = mongo.get_seed_doc(url)
    return scrapyd.get_state(seed_doc["job_id"])
示例#4
0
def search_tags(term):
    """Search tags for ``term`` and annotate every tag match.

    Each document under the result's ``"tag_matches"`` key is handed to
    ``_add_filtered_tags`` together with ``term``; the whole result object
    from ``MemexMongoUtils.search_tags`` is returned.
    """
    results = MemexMongoUtils().search_tags(term)
    for match in results["tag_matches"]:
        _add_filtered_tags(match, term)
    return results
示例#5
0
def search_tags(term):
    """Search tags for ``term`` and annotate every tag match.

    Each document under the result's ``"tag_matches"`` key is handed to
    ``_add_filtered_tags`` together with ``term``; the whole result object
    from ``MemexMongoUtils.search_tags`` is returned.
    """
    results = MemexMongoUtils().search_tags(term)
    for match in results["tag_matches"]:
        _add_filtered_tags(match, term)
    return results
示例#6
0
def schedule_spider_handler(seed, spider_host="localhost", spider_port="6800"):
    """Schedule a spider run for ``seed`` and record the resulting job.

    The job id returned by ``ScrapydJob.schedule`` is stored alongside the
    seed through ``MemexMongoUtils.add_job``. Always returns ``True``.
    """
    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host,
                           spider_port,
                           screenshot_dir=SCREENSHOT_DIR)
    scheduled_job_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_job_id)
    return True
示例#7
0
def add_known_urls_handler(urls_raw):
    """Insert every line of ``urls_raw`` as a URL into the "known-data" collection.

    Each line is first passed to ``validate_url``; that call sits outside the
    guarded section, so anything it raises propagates to the caller. A failing
    ``insert_url`` is best-effort: a message is printed and processing
    continues with the next line.

    Args:
        urls_raw: String that is split into URLs with ``splitlines()``.
    """
    mmu = MemexMongoUtils(which_collection="known-data")
    for url in urls_raw.splitlines():
        validate_url(url)
        try:
            mmu.insert_url(url=url)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            # NOTE(review): presumably the expected failure is a duplicate
            # key; any other insert error is reported with the same message.
            print("Existing URL attempted to be uploaded, skipping it...")
示例#8
0
def mark_interest_handler(interest, url):
    """Record on ``url`` whether the user finds it interesting.

    ``interest`` is reduced to a strict boolean before being forwarded to
    ``MemexMongoUtils.set_interest``.
    """
    mongo = MemexMongoUtils()
    # Per the original note: marking a url as uninteresting drops its score to 0.
    mongo.set_interest(url, bool(interest))
示例#9
0
def add_known_urls_handler(urls_raw):
    """Insert every line of ``urls_raw`` as a URL into the "known-data" collection.

    Each line is first passed to ``validate_url``; that call sits outside the
    guarded section, so anything it raises propagates to the caller. A failing
    ``insert_url`` is best-effort: a message is printed and processing
    continues with the next line.

    Args:
        urls_raw: String that is split into URLs with ``splitlines()``.
    """
    mmu = MemexMongoUtils(which_collection="known-data")
    for url in urls_raw.splitlines():
        validate_url(url)
        try:
            mmu.insert_url(url=url)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            # NOTE(review): presumably the expected failure is a duplicate
            # key; any other insert error is reported with the same message.
            print("Existing URL attempted to be uploaded, skipping it...")
示例#10
0
def schedule_spider_handler(seed, spider_host="localhost", spider_port="6800"):
    """Schedule the ``topical_finder`` spider of ``discovery-project`` for ``seed``.

    The job id returned by ``ScrapydJob.schedule`` is recorded with the same
    project and spider names through ``MemexMongoUtils.add_job``. Always
    returns ``True``.
    """
    project_name = "discovery-project"
    spider_name = "topical_finder"

    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host,
                           spider_port,
                           project=project_name,
                           spider=spider_name,
                           screenshot_dir=SCREENSHOT_DIR)
    scheduled_job_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_job_id, project=project_name, spider=spider_name)
    return True
示例#11
0
def get_job_state_handler(url, spider_host="localhost", spider_port="6800"):
    """Return what ``ScrapydJob.get_state`` reports for the job of seed ``url``.

    Both the job id and the project name are read from the seed document
    (``"job_id"`` and ``"project"`` keys) returned by
    ``MemexMongoUtils.get_seed_doc``.
    """
    seed = MemexMongoUtils().get_seed_doc(url)
    stored_job_id, stored_project = seed["job_id"], seed["project"]

    scrapyd = ScrapydJob(spider_host, spider_port, project=stored_project)
    return scrapyd.get_state(stored_job_id)
示例#12
0
def get_job_state_handler(url, spider_host="localhost", spider_port="6800"):
    """Return what ``ScrapydJob.get_state`` reports for the job of seed ``url``.

    Both the job id and the project name are read from the seed document
    (``"job_id"`` and ``"project"`` keys) returned by
    ``MemexMongoUtils.get_seed_doc``.
    """
    seed = MemexMongoUtils().get_seed_doc(url)
    stored_job_id, stored_project = seed["job_id"], seed["project"]

    scrapyd = ScrapydJob(spider_host, spider_port, project=stored_project)
    return scrapyd.get_state(stored_job_id)
示例#13
0
def schedule_spider_searchengine_handler(search_terms, spider_host="localhost", spider_port="6800"):
    """Schedule the ``google.com`` spider of ``searchengine-project`` for ``search_terms``.

    The job id returned by ``ScrapydJob.schedule_keywords`` is recorded, keyed
    by the search terms, through ``MemexMongoUtils.add_job``. Returns nothing.
    """
    mongo = MemexMongoUtils()
    project_name, spider_name = "searchengine-project", "google.com"

    scheduler = ScrapydJob(scrapyd_host=spider_host,
                           scrapyd_port=spider_port,
                           project=project_name,
                           spider=spider_name,
                           screenshot_dir=SCREENSHOT_DIR)
    keyword_job_id = scheduler.schedule_keywords(search_terms)
    mongo.add_job(search_terms,
                  keyword_job_id,
                  project=project_name,
                  spider=spider_name)
示例#14
0
def hosts_handler(page=1, page_size=10, current_host=None, which_collection="crawl-data", filter_field=None, filter_regex=None, show_all=None):
    """Put together host documents for use with the hosts endpoint.

    When ``current_host`` is given, one large page of hosts is requested and
    the list is cut at the end of the ``page_size``-sized page containing
    ``current_host``; nothing is cut if the host is never found. Otherwise a
    single page of ``page_size`` hosts is requested.

    Every returned document has its ``"_id"`` removed and gains
    ``"is_known_host"`` and ``"hsu_screenshot_path"`` (``None`` when no
    highest-scoring URL with a screenshot is reported for the host).

    Args:
        page: Page number passed to ``list_hosts``.
        page_size: Hosts per page; must be non-zero when ``current_host``
            is found, since it is used as a modulus.
        current_host: Optional host name to extend the listing up to.
        which_collection: Collection name given to ``MemexMongoUtils``.
        filter_field: Passed through to ``list_hosts``.
        filter_regex: Passed through to ``list_hosts``.
        show_all: Passed through to ``list_hosts``.

    Returns:
        The (mutated) list of host documents.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        max_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page, page_size=max_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

        matched = False
        keep = 0  # number of leading documents to retain
        for keep, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            # Stop at the first page boundary at or after the matched host.
            if matched and keep % page_size == 0:
                break

        # Drop everything past the boundary; a no-op when the loop ran to the end.
        del host_dics[keep:]
    else:
        host_dics = mmu.list_hosts(page=page, page_size=page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for host_dic in host_dics:
        # Known-host flag and screenshot path are attached here; host scoring
        # is not computed in this version.
        host_dic.pop("_id")
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])

        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics
示例#15
0
def urls_handler(host=None, which_collection="crawl-data"):
    """Return the URL documents listed for ``host``, prepared for output.

    Up to 1000 documents are requested from ``list_urls``. Each has its
    ``"_id"`` removed and its ``"crawled_at"`` value replaced by a
    ``"%Y-%m-%d %H:%M:%S"`` string, or by ``None`` when the value is missing
    or cannot be formatted.

    Args:
        host: Host passed to ``list_urls``.
        which_collection: Collection name given to ``MemexMongoUtils``.

    Returns:
        The (mutated) list of URL documents.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    url_dics = mmu.list_urls(host=host, limit=1000)

    for url_dic in url_dics:
        url_dic.pop("_id")
        try:
            date = url_dic["crawled_at"]
            url_dic["crawled_at"] = date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best-effort formatting: a missing key or a non-date value
            # yields None. Narrowed from a bare ``except`` so
            # KeyboardInterrupt/SystemExit still propagate.
            url_dic["crawled_at"] = None

    return url_dics
示例#16
0
def schedule_spider_handler(seed, spider_host="localhost", spider_port="6800"):
    """Schedule the ``topical_finder`` spider of ``discovery-project`` for ``seed``.

    The job id returned by ``ScrapydJob.schedule`` is recorded with the same
    project and spider names through ``MemexMongoUtils.add_job``. Always
    returns ``True``.
    """
    project_name = "discovery-project"
    spider_name = "topical_finder"

    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host, spider_port, project=project_name,
                           spider=spider_name, screenshot_dir=SCREENSHOT_DIR)
    scheduled_job_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_job_id, project=project_name, spider=spider_name)
    return True
示例#17
0
def schedule_spider_searchengine_handler(search_terms,
                                         spider_host="localhost",
                                         spider_port="6800"):
    """Schedule the ``google.com`` spider of ``searchengine-project`` for ``search_terms``.

    The job id returned by ``ScrapydJob.schedule_keywords`` is recorded, keyed
    by the search terms, through ``MemexMongoUtils.add_job``. Returns nothing.
    """
    mongo = MemexMongoUtils()
    project_name, spider_name = "searchengine-project", "google.com"

    scheduler = ScrapydJob(scrapyd_host=spider_host,
                           scrapyd_port=spider_port,
                           project=project_name,
                           spider=spider_name,
                           screenshot_dir=SCREENSHOT_DIR)
    keyword_job_id = scheduler.schedule_keywords(search_terms)
    mongo.add_job(search_terms,
                  keyword_job_id,
                  project=project_name,
                  spider=spider_name)
示例#18
0
def urls_handler(host=None, which_collection="crawl-data"):
    """Return the URL documents listed for ``host``, prepared for output.

    Up to 1000 documents are requested from ``list_urls``. Each has its
    ``"_id"`` removed and its ``"crawled_at"`` value replaced by a
    ``"%Y-%m-%d %H:%M:%S"`` string, or by ``None`` when the value is missing
    or cannot be formatted.

    Args:
        host: Host passed to ``list_urls``.
        which_collection: Collection name given to ``MemexMongoUtils``.

    Returns:
        The (mutated) list of URL documents.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    url_dics = mmu.list_urls(host=host, limit=1000)

    for url_dic in url_dics:
        url_dic.pop("_id")
        try:
            date = url_dic["crawled_at"]
            url_dic["crawled_at"] = date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best-effort formatting: a missing key or a non-date value
            # yields None. Narrowed from a bare ``except`` so
            # KeyboardInterrupt/SystemExit still propagate.
            url_dic["crawled_at"] = None

    return url_dics
示例#19
0
def urls_handler(host=None, which_collection="crawl-data"):
    """Return the URL documents listed for ``host``, prepared for output.

    Each document from ``list_urls`` has its ``"_id"`` removed and its
    ``"crawled_at"`` value replaced by ``date.isoformat()``, falling back to
    ``str(date)`` when that call fails. A document without a ``"crawled_at"``
    key raises ``KeyError``.

    Args:
        host: Host passed to ``list_urls``.
        which_collection: Collection name given to ``MemexMongoUtils``.

    Returns:
        The (mutated) list of URL documents.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    url_dics = mmu.list_urls(host=host)

    for url_dic in url_dics:
        url_dic.pop("_id")
        date = url_dic["crawled_at"]
        try:
            url_dic["crawled_at"] = date.isoformat()
        except Exception:
            # Values without a usable isoformat() are stringified instead.
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # still propagate.
            url_dic["crawled_at"] = str(date)

    return url_dics
示例#20
0
def save_display(host, displayable):
    """Apply ``displayable`` to every URL of ``host`` and store the display flag.

    The interest of each URL document listed for ``host`` is set to the
    boolean value of ``displayable``. The result of
    ``MemexMongoUtils.save_display(host, displayable)`` is returned.
    """
    mongo = MemexMongoUtils()
    interesting = bool(displayable)

    # The very large limit is the original's way of asking for all URLs.
    for doc in mongo.list_urls(host=host, limit=100000000):
        mongo.set_interest(doc["url"], interesting)

    return mongo.save_display(host, displayable)
示例#21
0
def get_page_number_for_host(path, page_size, current_host, filter_field=None, filter_regex=None, show_all=None):
    """Return the zero-based page index on which ``current_host`` is listed.

    The collection is resolved from ``path`` via ``get_collection_by_path``.
    Hosts are then requested in one large page, with the filter arguments
    passed through to ``list_hosts``, and scanned in the order returned.

    Args:
        path: Value handed to ``get_collection_by_path``.
        page_size: Hosts per page; a zero value raises ``ZeroDivisionError``
            once a matching host is found.
        current_host: Compared against each document's ``"host"`` value.
        filter_field: Passed through to ``list_hosts``.
        filter_regex: Passed through to ``list_hosts``.
        show_all: Passed through to ``list_hosts``.

    Returns:
        ``position // page_size`` for the first matching host, or 0 when no
        host in the fetched list matches.
    """
    which_collection = get_collection_by_path(path)
    mmu = MemexMongoUtils(which_collection=which_collection)

    # NOTE(review): presumably list_hosts caps its result at page_size, so
    # hosts beyond this bound would never be found -- confirm.
    max_page_size = 100 * 100
    host_dics = mmu.list_hosts(page=1, page_size=max_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page index an integer even where "/"
            # performs true division.
            return position // page_size

    return 0
示例#22
0
def save_display(host, displayable):
    """Apply ``displayable`` to every URL of ``host`` and store the display flag.

    The interest of each URL document listed for ``host`` is set to the
    boolean value of ``displayable``. The result of
    ``MemexMongoUtils.save_display(host, displayable)`` is returned.
    """
    mongo = MemexMongoUtils()
    interesting = bool(displayable)

    # The very large limit is the original's way of asking for all URLs.
    for doc in mongo.list_urls(host=host, limit=100000000):
        mongo.set_interest(doc["url"], interesting)

    return mongo.save_display(host, displayable)
示例#23
0
def mark_interest_handler(interest, url):
    """Record on ``url`` whether the user finds it interesting.

    ``interest`` is reduced to a strict boolean before being forwarded to
    ``MemexMongoUtils.set_interest``.
    """
    mongo = MemexMongoUtils()
    # Per the original note: marking a url as uninteresting drops its score to 0.
    mongo.set_interest(url, bool(interest))
示例#24
0
def save_blur_level(level):
    """Forward ``level`` to ``MemexMongoUtils.save_blur_level``; returns nothing."""
    MemexMongoUtils().save_blur_level(level)
示例#25
0
def save_search_term(list):
    """Forward ``list`` to ``MemexMongoUtils.save_search_term``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().save_search_term(list)
示例#26
0
def save_search_term(list):
    """Forward ``list`` to ``MemexMongoUtils.save_search_term``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().save_search_term(list)
示例#27
0
def get_score_handler():
    """Return a ``(with_interest, without_interest)`` pair of URL listings.

    Both listings come from ``MemexMongoUtils.list_all_urls_with_interest``
    called with ``return_html=True``, first for ``True`` and then for ``False``.
    """
    mongo = MemexMongoUtils()
    interesting = mongo.list_all_urls_with_interest(True, return_html=True)
    uninteresting = mongo.list_all_urls_with_interest(False, return_html=True)
    return interesting, uninteresting
示例#28
0
def set_workspace_selected(id):
    """Forward ``id`` to ``MemexMongoUtils.set_workspace_selected``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().set_workspace_selected(id)
示例#29
0
def get_score_handler():
    """Return a ``(with_interest, without_interest)`` pair of URL listings.

    Both listings come from ``MemexMongoUtils.list_all_urls_with_interest``
    called with ``return_html=True``, first for ``True`` and then for ``False``.
    """
    mongo = MemexMongoUtils()
    interesting = mongo.list_all_urls_with_interest(True, return_html=True)
    uninteresting = mongo.list_all_urls_with_interest(False, return_html=True)
    return interesting, uninteresting
示例#30
0
def add_workspace(name):
    """Forward ``name`` to ``MemexMongoUtils.add_workspace``; returns nothing."""
    MemexMongoUtils().add_workspace(name)
示例#31
0
def list_workspace():
    """Return whatever ``MemexMongoUtils.list_workspace`` yields."""
    workspaces = MemexMongoUtils().list_workspace()
    return workspaces
示例#32
0
def delete_workspace(id):
    """Forward ``id`` to ``MemexMongoUtils.delete_workspace``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().delete_workspace(id)
示例#33
0
def set_workspace_selected(id):
    """Forward ``id`` to ``MemexMongoUtils.set_workspace_selected``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().set_workspace_selected(id)
示例#34
0
def list_keyword():
    """Return whatever ``MemexMongoUtils.list_keyword`` yields."""
    keywords = MemexMongoUtils().list_keyword()
    return keywords
示例#35
0
def save_blur_level(level):
    """Forward ``level`` to ``MemexMongoUtils.save_blur_level``; returns nothing."""
    MemexMongoUtils().save_blur_level(level)
示例#36
0
def get_blur_level():
    """Return whatever ``MemexMongoUtils.get_blur_level`` yields."""
    blur_level = MemexMongoUtils().get_blur_level()
    return blur_level
示例#37
0
def save_tags(host, tags):
    """Forward ``host`` and ``tags`` to ``MemexMongoUtils.save_tags``; returns nothing."""
    MemexMongoUtils().save_tags(host, tags)
示例#38
0
def list_workspace():
    """Return whatever ``MemexMongoUtils.list_workspace`` yields."""
    workspaces = MemexMongoUtils().list_workspace()
    return workspaces
示例#39
0
def list_tags(host):
    """Return whatever ``MemexMongoUtils.list_tags`` yields for ``host``."""
    host_tags = MemexMongoUtils().list_tags(host)
    return host_tags
示例#40
0
def list_keyword():
    """Return whatever ``MemexMongoUtils.list_keyword`` yields."""
    keywords = MemexMongoUtils().list_keyword()
    return keywords
示例#41
0
def add_workspace(name):
    """Forward ``name`` to ``MemexMongoUtils.add_workspace``; returns nothing."""
    MemexMongoUtils().add_workspace(name)
示例#42
0
def set_score_handler(url, score):
    """Forward ``url`` and ``score`` to ``MemexMongoUtils.set_score``; returns nothing."""
    MemexMongoUtils().set_score(url, score)
示例#43
0
def delete_workspace(id):
    """Forward ``id`` to ``MemexMongoUtils.delete_workspace``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().delete_workspace(id)
示例#44
0
def save_keyword(list):
    """Forward ``list`` to ``MemexMongoUtils.save_keyword``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().save_keyword(list)
示例#45
0
def save_keyword(list):
    """Forward ``list`` to ``MemexMongoUtils.save_keyword``; returns nothing."""
    # The parameter name shadows the builtin; it is kept because it is part
    # of the callable interface.
    MemexMongoUtils().save_keyword(list)
示例#46
0
def discovery_handler():
    """Return whatever ``MemexMongoUtils.list_seeds`` yields."""
    return MemexMongoUtils().list_seeds()
示例#47
0
def list_search_term():
    """Return whatever ``MemexMongoUtils.list_search_term`` yields."""
    search_terms = MemexMongoUtils().list_search_term()
    return search_terms
示例#48
0
def set_score_handler(url, score):
    """Forward ``url`` and ``score`` to ``MemexMongoUtils.set_score``; returns nothing."""
    MemexMongoUtils().set_score(url, score)
示例#49
0
def discovery_handler():
    """Return whatever ``MemexMongoUtils.list_seeds`` yields."""
    return MemexMongoUtils().list_seeds()
示例#50
0
def list_tags(host):
    """Return whatever ``MemexMongoUtils.list_tags`` yields for ``host``."""
    host_tags = MemexMongoUtils().list_tags(host)
    return host_tags
示例#51
0
def get_blur_level():
    """Return whatever ``MemexMongoUtils.get_blur_level`` yields."""
    blur_level = MemexMongoUtils().get_blur_level()
    return blur_level
示例#52
0
def save_tags(host, tags):
    """Forward ``host`` and ``tags`` to ``MemexMongoUtils.save_tags``; returns nothing."""
    MemexMongoUtils().save_tags(host, tags)
示例#53
0
def hosts_handler(page=1,
                  page_size=10,
                  current_host=None,
                  which_collection="crawl-data",
                  filter_field=None,
                  filter_regex=None,
                  show_all=None):
    """Put together host documents for use with the hosts endpoint.

    When ``current_host`` is given, one large page of hosts is requested and
    the list is cut at the end of the ``page_size``-sized page containing
    ``current_host``; nothing is cut if the host is never found. Otherwise a
    single page of ``page_size`` hosts is requested.

    Every returned document has its ``"_id"`` removed and gains
    ``"is_known_host"`` and ``"hsu_screenshot_path"`` (``None`` when no
    highest-scoring URL with a screenshot is reported for the host).

    Args:
        page: Page number passed to ``list_hosts``.
        page_size: Hosts per page; must be non-zero when ``current_host``
            is found, since it is used as a modulus.
        current_host: Optional host name to extend the listing up to.
        which_collection: Collection name given to ``MemexMongoUtils``.
        filter_field: Passed through to ``list_hosts``.
        filter_regex: Passed through to ``list_hosts``.
        show_all: Passed through to ``list_hosts``.

    Returns:
        The (mutated) list of host documents.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        max_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page,
                                   page_size=max_page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

        matched = False
        keep = 0  # number of leading documents to retain
        for keep, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            # Stop at the first page boundary at or after the matched host.
            if matched and keep % page_size == 0:
                break

        # Drop everything past the boundary; a no-op when the loop ran to the end.
        del host_dics[keep:]
    else:
        host_dics = mmu.list_hosts(page=page,
                                   page_size=page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

    for host_dic in host_dics:
        # Known-host flag and screenshot path are attached here; host scoring
        # is not computed in this version.
        host_dic.pop("_id")
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])

        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(
                hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics
示例#54
0
def list_search_term():
    """Return whatever ``MemexMongoUtils.list_search_term`` yields."""
    search_terms = MemexMongoUtils().list_search_term()
    return search_terms