Python MemexMongoUtils示例，mongoutils.memex_mongo_utils.MemexMongoUtils Python示例

示例#1

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def get_page_number_for_host(path,
                             page_size,
                             current_host,
                             filter_field=None,
                             filter_regex=None,
                             show_all=None):
    """Return the page index on which ``current_host`` is listed.

    The collection is resolved from ``path`` with ``get_collection_by_path``
    and up to ``max_page_size`` hosts are fetched in one ``list_hosts`` call
    using the given filter arguments.  The position of the first entry whose
    ``"host"`` equals ``current_host`` is floor-divided by ``page_size``.

    Returns 0 when the host is not among the fetched entries, so the first
    ``page_size`` positions and "not found" are indistinguishable.

    NOTE(review): the result counts from 0 while ``list_hosts`` is called
    with ``page=1`` here -- confirm whether callers need to add 1.
    """
    which_collection = get_collection_by_path(path)
    mmu = MemexMongoUtils(which_collection=which_collection)

    max_page_size = 100 * 100  # upper bound on hosts fetched in one call
    host_dics = mmu.list_hosts(page=1,
                               page_size=max_page_size,
                               filter_field=filter_field,
                               filter_regex=filter_regex,
                               show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page index an integer even when true
            # division is in effect (Python 3 or
            # ``from __future__ import division``).
            return position // page_size

    return 0

示例#2

0

显示文件

文件： handlers.py 项目： k9team3/memex-pinterest

def hosts_handler(page = 1, which_collection = "crawl-data", filter_field = None, filter_regex = None):
    """Build the host documents returned by the hosts endpoint.

    Hosts are listed from ``which_collection`` for the requested ``page`` and
    filter.  Every document has its "_id" removed and gains three keys:
    "is_known_host", "host_score" and "hsu_screenshot_path" (None when no
    highest-scoring URL with a screenshot is returned for the host).
    """
    mongo = MemexMongoUtils(which_collection = which_collection)
    known_hosts = KnownHostsCompare()

    hosts = mongo.list_hosts(page = page, filter_field = filter_field, filter_regex = filter_regex)

    for entry in hosts:
        entry.pop("_id")
        hostname = entry["host"]

        # Known-host flag and host score are attached to each document here.
        entry["is_known_host"] = known_hosts.is_known_host(hostname)
        best_url = mongo.get_highest_scoring_url_with_screenshot(hostname)
        entry["host_score"] = mongo.get_host_score(hostname)

        entry["hsu_screenshot_path"] = (
            get_screenshot_relative_path(best_url['screenshot_path'])
            if best_url else None
        )

    return hosts

示例#3

0

显示文件

文件： handlers.py 项目： k9team3/memex-pinterest

def get_job_state_handler(url, spider_host = "localhost", spider_port = "6800"):
    """Return ``ScrapydJob.get_state`` for the job stored with ``url``.

    The job id is read from the "job_id" key of the seed document that
    ``get_seed_doc(url)`` returns.
    """
    mongo = MemexMongoUtils()
    scrapyd = ScrapydJob(spider_host, spider_port)
    seed_doc = mongo.get_seed_doc(url)
    return scrapyd.get_state(seed_doc["job_id"])

示例#4

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def search_tags(term):
    """Search tags for ``term`` and post-process the tag matches.

    Every document under the "tag_matches" key of the search result is
    passed to ``_add_filtered_tags`` together with ``term``; the whole
    result object is then returned.
    """
    results = MemexMongoUtils().search_tags(term)
    for tag_doc in results["tag_matches"]:
        _add_filtered_tags(tag_doc, term)
    return results

示例#5

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def search_tags(term):
    """Run a tag search for ``term`` and return the result object.

    Before returning, each entry of ``result["tag_matches"]`` is handed to
    ``_add_filtered_tags`` along with ``term``.
    """
    mongo = MemexMongoUtils()
    found = mongo.search_tags(term)

    for match in found["tag_matches"]:
        _add_filtered_tags(match, term)

    return found

示例#6

0

显示文件

文件： handlers.py 项目： k9team3/memex-pinterest

def schedule_spider_handler(seed, spider_host = "localhost", spider_port = "6800"):
    """Schedule a spider run for ``seed`` and record the resulting job.

    The job id returned by ``ScrapydJob.schedule`` is stored next to the
    seed through ``add_job``.  Always returns True once both calls return.
    """
    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host, spider_port, screenshot_dir = SCREENSHOT_DIR)

    scheduled_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_id)

    return True

示例#7

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def add_known_urls_handler(urls_raw):
    """Insert every line of ``urls_raw`` into the "known-data" collection.

    Each line is passed to ``validate_url`` and then to ``insert_url``.  A
    failing insert is reported on stdout and skipped so the remaining lines
    are still processed.

    NOTE(review): the return value of ``validate_url`` is ignored;
    presumably it raises on invalid input -- confirm.
    """
    mmu = MemexMongoUtils(which_collection="known-data")
    for url in urls_raw.splitlines():
        validate_url(url)
        try:
            mmu.insert_url(url=url)
        except Exception:
            # Best-effort insert: any ordinary error is treated as a
            # duplicate and skipped.  ``Exception`` (not a bare ``except``)
            # lets KeyboardInterrupt and SystemExit propagate.
            # Parenthesised so the line parses on Python 2 and Python 3.
            print("Existing URL attempted to be uploaded, skipping it...")

示例#8

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def mark_interest_handler(interest, url):
    """Store the truthiness of ``interest`` as the interest flag of ``url``."""
    # A falsy ``interest`` marks the url as uninteresting; per the original
    # note, its score then drops to 0.
    MemexMongoUtils().set_interest(url, bool(interest))

示例#9

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def add_known_urls_handler(urls_raw):
    """Insert every line of ``urls_raw`` into the "known-data" collection.

    Each line is passed to ``validate_url`` and then to ``insert_url``.  A
    failing insert is reported on stdout and skipped so the remaining lines
    are still processed.

    NOTE(review): the return value of ``validate_url`` is ignored;
    presumably it raises on invalid input -- confirm.
    """
    mmu = MemexMongoUtils(which_collection = "known-data")
    for url in urls_raw.splitlines():
        validate_url(url)
        try:
            mmu.insert_url(url = url)
        except Exception:
            # Best-effort insert: any ordinary error is treated as a
            # duplicate and skipped.  ``Exception`` (not a bare ``except``)
            # lets KeyboardInterrupt and SystemExit propagate.
            # Parenthesised so the line parses on Python 2 and Python 3.
            print("Existing URL attempted to be uploaded, skipping it...")

示例#10

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def schedule_spider_handler(seed, spider_host = "localhost", spider_port = "6800"):
    """Schedule the "topical_finder" spider of "discovery-project" for ``seed``.

    The job id returned by ``ScrapydJob.schedule`` is recorded with
    ``add_job`` under the same project and spider names.  Always returns
    True once both calls return.
    """
    project_name = "discovery-project"
    spider_name = "topical_finder"

    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host, spider_port, project = project_name,
                           spider = spider_name, screenshot_dir = SCREENSHOT_DIR)

    scheduled_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_id, project = project_name, spider = spider_name)

    return True

示例#11

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def get_job_state_handler(url, spider_host="localhost", spider_port="6800"):
    """Return ``ScrapydJob.get_state`` for the job stored with ``url``.

    The seed document from ``get_seed_doc(url)`` supplies both the "job_id"
    to query and the "project" the ``ScrapydJob`` is created for.
    """
    seed = MemexMongoUtils().get_seed_doc(url)
    # "job_id" is read before "project", as in the lookup order callers see.
    job_id, project_name = seed["job_id"], seed["project"]

    scrapyd = ScrapydJob(spider_host, spider_port, project=project_name)
    return scrapyd.get_state(job_id)

示例#12

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def get_job_state_handler(url, spider_host = "localhost", spider_port = "6800"):
    """Look up the job recorded for ``url`` and return its state.

    "job_id" and "project" are read from the seed document returned by
    ``get_seed_doc(url)``; the state comes from ``ScrapydJob.get_state``.
    """
    mongo = MemexMongoUtils()
    seed = mongo.get_seed_doc(url)
    wanted_job = seed["job_id"]
    owning_project = seed["project"]

    return ScrapydJob(spider_host, spider_port, project = owning_project).get_state(wanted_job)

示例#13

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def schedule_spider_searchengine_handler(search_terms, spider_host="localhost", spider_port="6800"):
    """Schedule the "google.com" spider of "searchengine-project" for keywords.

    ``search_terms`` is passed to ``ScrapydJob.schedule_keywords``; the
    returned job id is recorded with ``add_job`` under the same project and
    spider names.  Returns None.
    """
    mongo = MemexMongoUtils()
    # Shared by the scheduler and the job record so both stay in sync.
    target = {"project": "searchengine-project", "spider": "google.com"}

    scheduler = ScrapydJob(scrapyd_host=spider_host,
                           scrapyd_port=spider_port,
                           screenshot_dir=SCREENSHOT_DIR,
                           **target)
    scheduled_id = scheduler.schedule_keywords(search_terms)
    mongo.add_job(search_terms, scheduled_id, **target)

示例#14

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def hosts_handler(page=1, page_size=10, current_host=None, which_collection="crawl-data", filter_field=None, filter_regex=None, show_all=None):
    """Put together host documents for use with the hosts endpoint.

    Without ``current_host`` one page of ``page_size`` hosts is listed.

    With ``current_host`` up to ``max_page_size`` hosts are requested and
    the list is cut at the first position that is a multiple of
    ``page_size`` at or after the entry whose "host" equals
    ``current_host``.  If the host is not found, or no such position is
    reached, every fetched entry is kept.

    Each returned document has "_id" removed and gains "is_known_host" and
    "hsu_screenshot_path" (None when no highest-scoring URL with a
    screenshot is returned for the host).
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        max_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page, page_size=max_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

        matched = False
        kept = 0  # number of leading entries to keep; stays 0 for an empty list
        for kept, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            if matched and kept % page_size == 0:
                break

        # Drop everything after the page boundary found above.
        del host_dics[kept:]

    else:
        host_dics = mmu.list_hosts(page=page, page_size=page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for host_dic in host_dics:
        # Only known-host status and the screenshot path are attached; the
        # host_score lookup is disabled in this handler.
        host_dic.pop("_id")
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])

        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics

示例#15

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def urls_handler(host=None, which_collection="crawl-data"):
    """Put together URL documents for ``host`` for use with the urls endpoint.

    Up to 1000 documents are listed from ``which_collection``.  Each one has
    "_id" removed and its "crawled_at" value replaced by a
    "%Y-%m-%d %H:%M:%S" string, or by None when the value is missing or
    cannot be formatted.
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    url_dics = mmu.list_urls(host=host, limit=1000)

    for url_dic in url_dics:
        url_dic.pop("_id")
        try:
            date = url_dic["crawled_at"]
            url_dic["crawled_at"] = date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best-effort formatting: a missing key or a value without a
            # usable strftime falls back to None.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt and SystemExit propagate.
            url_dic["crawled_at"] = None

    return url_dics

示例#16

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def schedule_spider_handler(seed, spider_host="localhost", spider_port="6800"):
    """Schedule the "topical_finder" spider of "discovery-project" for ``seed``.

    The job id returned by ``ScrapydJob.schedule`` is recorded with
    ``add_job`` under the same project and spider names.  Always returns
    True once both calls return.
    """
    project_name = "discovery-project"
    spider_name = 'topical_finder'

    mongo = MemexMongoUtils()
    scheduler = ScrapydJob(spider_host, spider_port, project=project_name,
                           spider=spider_name, screenshot_dir=SCREENSHOT_DIR)

    scheduled_id = scheduler.schedule(seed)
    mongo.add_job(seed, scheduled_id, project=project_name, spider=spider_name)

    return True

示例#17

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def schedule_spider_searchengine_handler(search_terms,
                                         spider_host="localhost",
                                         spider_port="6800"):
    """Schedule the "google.com" spider of "searchengine-project" for keywords.

    ``search_terms`` is passed to ``ScrapydJob.schedule_keywords``; the
    returned job id is recorded with ``add_job`` under the same project and
    spider names.  Returns None.
    """
    mongo = MemexMongoUtils()
    # Shared by the scheduler and the job record so both stay in sync.
    target = {"project": "searchengine-project", "spider": "google.com"}

    scheduler = ScrapydJob(scrapyd_host=spider_host,
                           scrapyd_port=spider_port,
                           screenshot_dir=SCREENSHOT_DIR,
                           **target)
    scheduled_id = scheduler.schedule_keywords(search_terms)
    mongo.add_job(search_terms, scheduled_id, **target)

示例#18

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def urls_handler(host = None, which_collection  = "crawl-data"):
    """Put together URL documents for ``host`` for use with the urls endpoint.

    Up to 1000 documents are listed from ``which_collection``.  Each one has
    "_id" removed and its "crawled_at" value replaced by a
    "%Y-%m-%d %H:%M:%S" string, or by None when the value is missing or
    cannot be formatted.
    """
    mmu = MemexMongoUtils(which_collection = which_collection)
    url_dics = mmu.list_urls(host = host, limit = 1000)

    for url_dic in url_dics:
        url_dic.pop("_id")
        try:
            date = url_dic["crawled_at"]
            url_dic["crawled_at"] = date.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best-effort formatting: a missing key or a value without a
            # usable strftime falls back to None.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt and SystemExit propagate.
            url_dic["crawled_at"] = None

    return url_dics

示例#19

0

显示文件

文件： handlers.py 项目： k9team3/memex-pinterest

def urls_handler(host = None, which_collection  = "crawl-data"):
    """Put together URL documents for ``host`` for use with the urls endpoint.

    Each document listed from ``which_collection`` has "_id" removed and its
    "crawled_at" value replaced by ``isoformat()`` output, falling back to
    ``str()`` of the value when that call fails.  A document without a
    "crawled_at" key raises KeyError.
    """
    mmu = MemexMongoUtils(which_collection = which_collection)
    url_dics = mmu.list_urls(host = host)

    for url_dic in url_dics:
        url_dic.pop("_id")
        date = url_dic["crawled_at"]
        try:
            url_dic["crawled_at"] = date.isoformat()
        except Exception:
            # Values without a working isoformat() are stringified instead.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            url_dic["crawled_at"] = str(date)

    return url_dics

示例#20

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def save_display(host, displayable):
    """Apply ``displayable`` to every URL of ``host`` and save the setting.

    The interest flag of each URL listed for ``host`` is set to the
    truthiness of ``displayable``; the value returned by
    ``MemexMongoUtils.save_display`` is passed back to the caller.
    """
    mongo = MemexMongoUtils()
    interest_flag = bool(displayable)

    for doc in mongo.list_urls(host=host, limit=100000000):
        mongo.set_interest(doc["url"], interest_flag)

    return mongo.save_display(host, displayable)

示例#21

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def get_page_number_for_host(path, page_size, current_host, filter_field=None, filter_regex=None, show_all=None):
    """Return the page index on which ``current_host`` is listed.

    The collection is resolved from ``path`` with ``get_collection_by_path``
    and up to ``max_page_size`` hosts are fetched in one ``list_hosts`` call
    using the given filter arguments.  The position of the first entry whose
    ``"host"`` equals ``current_host`` is floor-divided by ``page_size``.

    Returns 0 when the host is not among the fetched entries, so the first
    ``page_size`` positions and "not found" are indistinguishable.

    NOTE(review): the result counts from 0 while ``list_hosts`` is called
    with ``page=1`` here -- confirm whether callers need to add 1.
    """
    which_collection = get_collection_by_path(path)
    mmu = MemexMongoUtils(which_collection=which_collection)

    max_page_size = 100 * 100  # upper bound on hosts fetched in one call
    host_dics = mmu.list_hosts(page=1, page_size=max_page_size, filter_field=filter_field, filter_regex=filter_regex, show_all=show_all)

    for position, host_dic in enumerate(host_dics):
        if host_dic["host"] == current_host:
            # Floor division keeps the page index an integer even when true
            # division is in effect (Python 3 or
            # ``from __future__ import division``).
            return position // page_size

    return 0

示例#22

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def save_display(host, displayable):
    """Set the interest flag of all URLs of ``host`` and save the setting.

    Every URL listed for ``host`` gets ``bool(displayable)`` as its interest
    flag.  Returns whatever ``MemexMongoUtils.save_display`` returns.
    """
    mongo = MemexMongoUtils()
    flag = bool(displayable)

    for listed in mongo.list_urls(host = host, limit=100000000):
        mongo.set_interest(listed["url"], flag)

    return mongo.save_display(host, displayable)

示例#23

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def mark_interest_handler(interest, url):
    """Record whether ``url`` is interesting, based on ``interest`` truthiness."""
    mongo = MemexMongoUtils()
    # If the user marks the url as uninteresting (falsy ``interest``), the
    # original note says its score drops to 0.
    is_interesting = True if interest else False
    mongo.set_interest(url, is_interesting)

示例#24

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def save_blur_level(level):
    """Pass ``level`` to ``MemexMongoUtils.save_blur_level``; returns None."""
    MemexMongoUtils().save_blur_level(level)

示例#25

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def save_search_term(list):
    """Pass ``list`` to ``MemexMongoUtils.save_search_term``; returns None."""
    # The parameter name shadows the ``list`` builtin; it is kept because it
    # is part of the function's call interface.
    MemexMongoUtils().save_search_term(list)

示例#26

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def save_search_term(list):
    """Forward ``list`` to ``MemexMongoUtils.save_search_term``.

    Returns None.  The parameter name shadows the ``list`` builtin but is
    kept as part of the call interface.
    """
    storage = MemexMongoUtils()
    storage.save_search_term(list)
    return None

示例#27

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def get_score_handler():
    """Return ``(yes_docs, no_docs)`` from ``list_all_urls_with_interest``.

    The first element is the result for interest True, the second for
    interest False; both calls pass ``return_html=True``.
    """
    mongo = MemexMongoUtils()
    return (
        mongo.list_all_urls_with_interest(True, return_html=True),
        mongo.list_all_urls_with_interest(False, return_html=True),
    )

示例#28

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def set_workspace_selected(id):
    """Pass ``id`` to ``MemexMongoUtils.set_workspace_selected``; returns None."""
    # ``id`` shadows the builtin but is kept as part of the call interface.
    MemexMongoUtils().set_workspace_selected(id)

示例#29

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def get_score_handler():
    """Fetch the URL documents marked interesting and not interesting.

    Returns a 2-tuple: the ``list_all_urls_with_interest`` result for True
    followed by the result for False, both requested with
    ``return_html = True``.
    """
    mongo = MemexMongoUtils()
    liked = mongo.list_all_urls_with_interest(True, return_html = True)
    disliked = mongo.list_all_urls_with_interest(False, return_html = True)
    return (liked, disliked)

示例#30

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def add_workspace(name):
    """Pass ``name`` to ``MemexMongoUtils.add_workspace``; returns None."""
    MemexMongoUtils().add_workspace(name)

示例#31

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def list_workspace():
    """Return the result of ``MemexMongoUtils.list_workspace``."""
    return MemexMongoUtils().list_workspace()

示例#32

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def delete_workspace(id):
    """Pass ``id`` to ``MemexMongoUtils.delete_workspace``; returns None."""
    # ``id`` shadows the builtin but is kept as part of the call interface.
    MemexMongoUtils().delete_workspace(id)

示例#33

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def set_workspace_selected(id):
    """Forward ``id`` to ``MemexMongoUtils.set_workspace_selected``.

    Returns None.  The parameter name shadows the ``id`` builtin but is kept
    as part of the call interface.
    """
    storage = MemexMongoUtils()
    storage.set_workspace_selected(id)
    return None

示例#34

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def list_keyword():
    """Return the result of ``MemexMongoUtils.list_keyword``."""
    return MemexMongoUtils().list_keyword()

示例#35

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def save_blur_level(level):
    """Forward ``level`` to ``MemexMongoUtils.save_blur_level``.

    Returns None.
    """
    storage = MemexMongoUtils()
    storage.save_blur_level(level)
    return None

示例#36

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def get_blur_level():
    """Return the result of ``MemexMongoUtils.get_blur_level``."""
    return MemexMongoUtils().get_blur_level()

示例#37

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def save_tags(host, tags):
    """Pass ``host`` and ``tags`` to ``MemexMongoUtils.save_tags``; returns None."""
    MemexMongoUtils().save_tags(host, tags)

示例#38

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def list_workspace():
    """Fetch and return whatever ``MemexMongoUtils.list_workspace`` yields."""
    workspaces = MemexMongoUtils().list_workspace()
    return workspaces

示例#39

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def list_tags(host):
    """Return the result of ``MemexMongoUtils.list_tags`` for ``host``."""
    return MemexMongoUtils().list_tags(host)

示例#40

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def list_keyword():
    """Fetch and return whatever ``MemexMongoUtils.list_keyword`` yields."""
    keywords = MemexMongoUtils().list_keyword()
    return keywords

示例#41

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def add_workspace(name):
    """Forward ``name`` to ``MemexMongoUtils.add_workspace``.

    Returns None.
    """
    storage = MemexMongoUtils()
    storage.add_workspace(name)
    return None

示例#42

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def set_score_handler(url, score):
    """Pass ``url`` and ``score`` to ``MemexMongoUtils.set_score``; returns None."""
    MemexMongoUtils().set_score(url, score)

示例#43

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def delete_workspace(id):
    """Forward ``id`` to ``MemexMongoUtils.delete_workspace``.

    Returns None.  The parameter name shadows the ``id`` builtin but is kept
    as part of the call interface.
    """
    storage = MemexMongoUtils()
    storage.delete_workspace(id)
    return None

示例#44

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def save_keyword(list):
    """Pass ``list`` to ``MemexMongoUtils.save_keyword``; returns None."""
    # The parameter name shadows the ``list`` builtin; it is kept because it
    # is part of the function's call interface.
    MemexMongoUtils().save_keyword(list)

示例#45

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def save_keyword(list):
    """Forward ``list`` to ``MemexMongoUtils.save_keyword``.

    Returns None.  The parameter name shadows the ``list`` builtin but is
    kept as part of the call interface.
    """
    storage = MemexMongoUtils()
    storage.save_keyword(list)
    return None

示例#46

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def discovery_handler():
    """Return the result of ``MemexMongoUtils.list_seeds``."""
    return MemexMongoUtils().list_seeds()

示例#47

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def list_search_term():
    """Return the result of ``MemexMongoUtils.list_search_term``."""
    return MemexMongoUtils().list_search_term()

示例#48

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def set_score_handler(url, score):
    """Forward ``url`` and ``score`` to ``MemexMongoUtils.set_score``.

    Returns None.
    """
    storage = MemexMongoUtils()
    storage.set_score(url, score)
    return None

示例#49

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def discovery_handler():
    """Fetch and return whatever ``MemexMongoUtils.list_seeds`` yields."""
    storage = MemexMongoUtils()
    return storage.list_seeds()

示例#50

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def list_tags(host):
    """Fetch and return whatever ``MemexMongoUtils.list_tags`` yields for ``host``."""
    tags_for_host = MemexMongoUtils().list_tags(host)
    return tags_for_host

示例#51

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def get_blur_level():
    """Fetch and return whatever ``MemexMongoUtils.get_blur_level`` yields."""
    current_level = MemexMongoUtils().get_blur_level()
    return current_level

示例#52

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def save_tags(host, tags):
    """Forward ``host`` and ``tags`` to ``MemexMongoUtils.save_tags``.

    Returns None.
    """
    storage = MemexMongoUtils()
    storage.save_tags(host, tags)
    return None

示例#53

0

显示文件

文件： handlers.py 项目： hoardboard/memex-pinterest

def hosts_handler(page=1,
                  page_size=10,
                  current_host=None,
                  which_collection="crawl-data",
                  filter_field=None,
                  filter_regex=None,
                  show_all=None):
    """Put together host documents for use with the hosts endpoint.

    Without ``current_host`` one page of ``page_size`` hosts is listed.

    With ``current_host`` up to ``max_page_size`` hosts are requested and
    the list is cut at the first position that is a multiple of
    ``page_size`` at or after the entry whose "host" equals
    ``current_host``.  If the host is not found, or no such position is
    reached, every fetched entry is kept.

    Each returned document has "_id" removed and gains "is_known_host" and
    "hsu_screenshot_path" (None when no highest-scoring URL with a
    screenshot is returned for the host).
    """
    mmu = MemexMongoUtils(which_collection=which_collection)
    khc = KnownHostsCompare()

    if current_host:
        max_page_size = 10 * 100  # max results per page
        host_dics = mmu.list_hosts(page=page,
                                   page_size=max_page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

        matched = False
        kept = 0  # number of leading entries to keep; stays 0 for an empty list
        for kept, host_dic in enumerate(host_dics, 1):
            if host_dic["host"] == current_host:
                matched = True
            if matched and kept % page_size == 0:
                break

        # Drop everything after the page boundary found above.
        del host_dics[kept:]

    else:
        host_dics = mmu.list_hosts(page=page,
                                   page_size=page_size,
                                   filter_field=filter_field,
                                   filter_regex=filter_regex,
                                   show_all=show_all)

    for host_dic in host_dics:
        # Only known-host status and the screenshot path are attached; the
        # host_score lookup is disabled in this handler.
        host_dic.pop("_id")
        host_dic["is_known_host"] = khc.is_known_host(host_dic["host"])
        hsu = mmu.get_highest_scoring_url_with_screenshot(host_dic["host"])

        if hsu:
            host_dic["hsu_screenshot_path"] = get_screenshot_relative_path(
                hsu['screenshot_path'])
        else:
            host_dic["hsu_screenshot_path"] = None

    return host_dics

示例#54

0

显示文件

文件： handlers.py 项目： X01VVD01X/memex-pinterest

def list_search_term():
    """Fetch and return whatever ``MemexMongoUtils.list_search_term`` yields."""
    search_terms = MemexMongoUtils().list_search_term()
    return search_terms