Пример #1
0
def register_query_pages(query_id, query_str, cursor, ntcir_urls_folder, ntcir_htmls_folder, cache_folder):
    """Register an NTCIR query and link its listed local HTML pages to it.

    Reads ``<ntcir_urls_folder>/<query_id>.MAND.tsv`` line by line. Each
    non-blank line, stripped of surrounding whitespace, is taken as a file
    name under ``ntcir_htmls_folder``. Every such page is handed to
    ``web_search.copy_existing_page`` and then linked to the query through
    ``web_search.ensure_page_query_link`` with consecutive ranks starting
    at 0, in listing order.

    Args:
        query_id: Identifier used to build the ``.MAND.tsv`` listing name.
        query_str: Query text passed to ``web_search.add_query``.
        cursor: Database cursor forwarded to every ``web_search`` call.
        ntcir_urls_folder: Folder holding the ``<query_id>.MAND.tsv`` files.
        ntcir_htmls_folder: Folder holding the HTML files named in the listing.
        cache_folder: Cache root; pages go under ``<cache_folder>/pages``.

    Raises:
        IOError/OSError: If the listing file cannot be opened.
    """
    cache_files_folder = "%s/pages" % (cache_folder,)

    qid = web_search.add_query(cursor, query_str, 'NTCIR')

    listing_path = "%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id)
    rank = 0
    # ``with open`` (instead of iterating a bare ``file(...)``) guarantees the
    # listing is closed even when one of the web_search calls raises.
    with open(listing_path) as listing:
        for line in listing:
            file_name = line.strip()
            if not file_name:
                # A blank line names no page; skip it without using a rank.
                continue
            full_name = "%s/%s" % (ntcir_htmls_folder, file_name)
            url = "file://%s" % (full_name,)
            web_search.copy_existing_page(cursor, cache_files_folder, full_name, url)
            web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
            rank += 1
Пример #2
0
def register_query_pages(query_id, query_str, cursor, ntcir_urls_folder,
                         ntcir_htmls_folder, cache_folder):
    """Register an NTCIR query and attach the pages listed for it.

    The listing ``<ntcir_urls_folder>/<query_id>.MAND.tsv`` is read one line
    at a time. Each non-blank line (surrounding whitespace removed) is a
    file name relative to ``ntcir_htmls_folder``. For every name, in order,
    the page is passed to ``web_search.copy_existing_page`` and linked to
    the new query via ``web_search.ensure_page_query_link``; ranks are
    consecutive integers starting at 0.

    Args:
        query_id: Identifier used to build the ``.MAND.tsv`` listing name.
        query_str: Query text passed to ``web_search.add_query``.
        cursor: Database cursor forwarded to every ``web_search`` call.
        ntcir_urls_folder: Folder holding the ``<query_id>.MAND.tsv`` files.
        ntcir_htmls_folder: Folder holding the HTML files named in the
            listing.
        cache_folder: Cache root; pages go under ``<cache_folder>/pages``.

    Raises:
        IOError/OSError: If the listing file cannot be opened.
    """
    cache_files_folder = "%s/pages" % (cache_folder, )

    qid = web_search.add_query(cursor, query_str, 'NTCIR')

    listing_path = "%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id)
    # The context manager closes the listing deterministically; the original
    # bare ``file(...)`` iteration left the handle to the garbage collector.
    with open(listing_path) as listing:
        stripped = (line.strip() for line in listing)
        # Blank lines name no page, so they are dropped before ranking.
        names = (name for name in stripped if name)
        for rank, file_name in enumerate(names):
            full_name = "%s/%s" % (ntcir_htmls_folder, file_name)
            url = "file://%s" % (full_name, )
            web_search.copy_existing_page(cursor, cache_files_folder,
                                          full_name, url)
            web_search.ensure_page_query_link(cursor, cache_files_folder, qid,
                                              rank, url)
Пример #3
0
    # Index the URLs listed in html_file (one per line) under a new query
    # record for (query_str, search_engine) in the cache database.
    (conn, cursor) = web_search.open_db(cache_folder)
    try:
        # Abort when find_query already reports this query for this engine.
        if web_search.find_query(cursor, query_str, search_engine) is not None:
            sys.exit("Query already in index")

        cache_files_folder = "%s/pages" % (cache_folder, )
        # Each message is one pre-formatted string, so the output is the
        # same whether print is a statement or a function.
        print("page cache folder %s" % (cache_files_folder, ))
        try:
            os.mkdir(cache_files_folder)
            print("creating page cache folder %s" % (cache_files_folder, ))
        except OSError as exc:
            # An already-existing folder is the expected case and stays
            # silent. Any other failure is reported but does not abort.
            # (The former ``except object`` clause could never match.)
            if not os.path.isdir(cache_files_folder):
                print("(warning) problem creating %s %s" %
                      (cache_files_folder, exc))

        qid = web_search.add_query(cursor, query_str, search_engine)

        rank = 0
        found = set()
        # ``with open`` closes the URL list even if linking a page raises.
        with open(html_file) as url_file:
            for line in url_file:
                url = line.replace('\n', '')
                # Skip blank lines and URLs already linked; neither uses
                # up a rank.
                if not url or url in found:
                    continue
                found.add(url)
                web_search.ensure_page_query_link(cursor, cache_files_folder,
                                                  qid, rank, url)
                # Commit per URL so links stored so far survive a later
                # failure.
                conn.commit()
                rank += 1
    finally:
        # Runs on normal completion, on sys.exit above, and on errors.
        conn.close()
    # Index the URLs listed in html_file (one per line) under a new query
    # record for (query_str, search_engine) in the cache database.
    (conn, cursor) = web_search.open_db(cache_folder)
    try:
        # Abort when find_query already reports this query for this engine.
        if web_search.find_query(cursor, query_str, search_engine) is not None:
            sys.exit("Query already in index")

        cache_files_folder = "%s/pages" % (cache_folder,)
        # Each message is one pre-formatted string, so the output is the
        # same whether print is a statement or a function.
        print("page cache folder %s" % (cache_files_folder,))
        try:
            os.mkdir(cache_files_folder)
            print("creating page cache folder %s" % (cache_files_folder,))
        except OSError as exc:
            # An already-existing folder is the expected case and stays
            # silent. Any other failure is reported but does not abort.
            # (The former ``except object`` clause could never match.)
            if not os.path.isdir(cache_files_folder):
                print("(warning) problem creating %s %s" % (cache_files_folder, exc))

        qid = web_search.add_query(cursor, query_str, search_engine)

        rank = 0
        found = set()
        # ``with open`` closes the URL list even if linking a page raises.
        with open(html_file) as url_file:
            for line in url_file:
                url = line.replace("\n", "")
                # Skip blank lines and URLs already linked; neither uses
                # up a rank.
                if not url or url in found:
                    continue
                found.add(url)
                web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
                # Commit per URL so links stored so far survive a later
                # failure.
                conn.commit()
                rank += 1
    finally:
        # Runs on normal completion, on sys.exit above, and on errors.
        conn.close()