Example #1
def write(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims, True)
    out_path = os.path.join(
        output_path,
        "perspective_{}_claim_query_k0.json".format(split_name))
    save_queries_to_file(queries, out_path)
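
Every example in this section hands its queries off to save_queries_to_file, whose implementation is not shown. For reference, a minimal sketch of what such a helper could look like, assuming Galago's batch-search JSON layout (a top-level "queries" list of {"number", "text"} objects); the field names and exact behavior of the real helper are assumptions, not the project's code:

import json
from typing import Dict, List

DocQuery = Dict[str, str]  # assumed alias, e.g. {"number": "q1", "text": "#combine(a b)"}


def save_queries_to_file(queries: List[DocQuery], out_path: str) -> None:
    # Galago's batch-search reads a JSON object with a top-level "queries" list.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"queries": queries}, f, indent=2)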
Example #2
def main():
    pc_clusters: Iterable[PerspectiveCluster] = enum_perspective_clusters()
    tokenizer = TokenizerForGalago()

    def get_terms(text: str) -> Counter:
        terms = tokenizer.tokenize(text)
        return Counter(terms)

    # Query = [claim :: avg(perspective)]
    claim_text_d: Dict[int, str] = get_all_claim_d()
    perspective_text_d: Dict[int, str] = get_perspective_dict()

    def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
        claim_text = claim_text_d[cluster.claim_id]
        perspective_text_list = [
            perspective_text_d[pid] for pid in cluster.perspective_ids]
        query_id = get_pc_cluster_query_id(cluster)
        claim_tf: Counter = get_terms(claim_text)
        pers_tf: Counter = average_counters(
            lmap(get_terms, perspective_text_list))
        tf = sum_counters([claim_tf, pers_tf])
        query: DocQuery = counter_to_galago_query(query_id, tf)
        return query

    query_list: List[DocQuery] = lmap(cluster_to_query, pc_clusters)
    print(len(query_list))
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_query_for_evidence.json")
    save_queries_to_file(query_list, out_path)
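
Example #2 builds each query by averaging the per-perspective term frequencies, adding the claim's term frequencies, and converting the combined counter into a weighted Galago query. The helpers it relies on (sum_counters, average_counters, counter_to_galago_query) are not shown; the sketch below is one plausible reading of them, and the weighted #combine encoding in particular is an assumption about how the real counter_to_galago_query behaves:

from collections import Counter
from typing import Dict, List

DocQuery = Dict[str, str]  # assumed alias: {"number": ..., "text": ...}


def sum_counters(counters: List[Counter]) -> Counter:
    # Element-wise sum of term frequencies.
    total = Counter()
    for c in counters:
        total.update(c)
    return total


def average_counters(counters: List[Counter]) -> Counter:
    # Element-wise mean of term frequencies.
    total = sum_counters(counters)
    n = len(counters)
    return Counter({term: cnt / n for term, cnt in total.items()})


def counter_to_galago_query(query_id: str, tf: Counter) -> DocQuery:
    # One plausible encoding: Galago's index-keyed weighted combine,
    # "#combine:0=w0:1=w1(t0 t1 ...)", using term frequencies as weights.
    terms = list(tf.keys())
    weights = ":".join("{}={}".format(i, tf[t]) for i, t in enumerate(terms))
    text = "#combine:{}({})".format(weights, " ".join(terms))
    return {"number": str(query_id), "text": text}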
Example #3
def write_queries_to_files(n_query_per_file, out_dir, queries: List[DocQuery]):
    i = 0
    while i * n_query_per_file < len(queries):
        st = i * n_query_per_file
        ed = (i + 1) * n_query_per_file
        out_path = os.path.join(out_dir, "{}.json".format(i))
        save_queries_to_file(queries[st:ed], out_path)
        i += 1
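
A usage sketch; the shard size of 50 mirrors the inline loop in Example #7 below, and the output directory is illustrative:

# Writes 0.json, 1.json, ... with at most 50 queries each.
write_queries_to_files(50, "out/query_shards", query_list)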
Example #4
def write_simple_claim_queries():
    for split in splits:
        claim_ids = load_claim_ids_for_split(split)
        claims = get_claims_from_ids(claim_ids)
        queries = get_simple_claim_query(claims, True)
        out_path = os.path.join(output_path, "perspective_query",
                                "simple_query_{}.json".format(split))
        save_queries_to_file(queries, out_path)
Example #5
def work(years, query_type, save_path):
    queries = load_queries(years)

    def convert_query(q):
        return trec_query_to_galago_query(q, query_type)

    queries = lmap(convert_query, queries)
    save_queries_to_file(queries, save_path)
Example #6
def xml_query_to_json(xml_path, json_path):
    queries: List[Query] = load_xml_query(xml_path)

    def transform(q: Query) -> Dict:
        tokens = word_tokenize(q.text)
        tokens = clean_query(tokens)
        return format_query_bm25(q.qid, tokens)

    queries_dict_list: List[Dict] = lmap(transform, queries)
    save_queries_to_file(queries_dict_list, json_path)
Example #7
def write_queries(split: str, queries: List[DocQuery]):
    root_dir_path = os.path.join(job_man_dir, "counter_arg_queries")
    exist_or_mkdir(root_dir_path)
    dir_path = os.path.join(root_dir_path, split)
    exist_or_mkdir(dir_path)
    query_per_file = 50
    file_idx = 0
    while file_idx * query_per_file < len(queries):
        save_path = os.path.join(dir_path, str(file_idx) + ".json")
        st = file_idx * query_per_file
        ed = st + query_per_file
        queries_to_save = queries[st:ed]
        save_queries_to_file(queries_to_save, save_path)
        file_idx += 1
Example #8
def write_claim_as_query():
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    queries = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        tokens = claim_text.split()
        query_text = clean_query(tokens)
        print(query_text)
        q_entry = get_query_entry_bm25_anseri(cid, query_text)
        queries.append(q_entry)

    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)
Example #9
def main():
    split = "dev"
    stopword = load_stopwords_for_query()
    # split = "train"
    ex_info_dir = "/mnt/nfs/work3/youngwookim/job_man/pc_rm_terms_{}".format(
        split)
    query_path = os.path.join(
        output_path, "perspective_{}_claim_query_k0_fixed.json".format(split))
    queries = load_queries(query_path)
    ex_w_scale = 100
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_{}_claim_query_rm_ex.json".format(split))
    new_queries = get_extended(ex_info_dir, ex_w_scale, queries, stopword)
    save_queries_to_file(new_queries, out_path)
Example #10
def main():
    print("Start")
    spr = StreamPickleReader("robust_candi_query_")
    query_per_task = 1000 * 10
    out_idx = 0
    while spr.has_next():
        queries = []
        for i in range(query_per_task):
            if not spr.has_next():
                break
            q_id, query = spr.get_item()
            query = clean_query(query)
            queries.append(get_query_entry(q_id, query))

        out_path = os.path.join(cpath.output_path, "query",
                                "g_query_{}.json".format(out_idx))
        save_queries_to_file(queries, out_path)
        out_idx += 1
Example #11
def send_queries_inner(index_path, num_result, queries, timeout) -> List[str]:
    query_path = get_new_query_json_path()
    # save query to file
    save_queries_to_file(queries, query_path)
    # issue galago command
    cmd = [
        "galago", "threaded-batch-search", "--requested=" + str(num_result),
        "--index=" + index_path, query_path
    ]
    os.environ['PYTHONUNBUFFERED'] = "1"
    temp_outpath = query_path + ".output"
    out_file = open(temp_outpath, "w")
    proc = subprocess.Popen(
        cmd,
        stdout=out_file,
        stderr=PIPE,
        universal_newlines=True,
    )
    # wait on the process, reading galago's progress messages from stderr
    prev_num_remain = 999999
    last_update_time = time.time()
    while proc.poll() is None:
        line = proc.stderr.readline()
        if line.startswith("INFO: Still running..."):
            st = len("INFO: Still running...")
            tokens = line[st:].split()
            num_remain = int(tokens[0])
            if num_remain != prev_num_remain:
                print(line, end='')
                prev_num_remain = num_remain
                last_update_time = time.time()

        if time.time() - last_update_time > timeout:
            # no progress report for `timeout` seconds: give up and kill galago
            proc.kill()
            break
    out_file.close()

    with open(temp_outpath, "r") as f:
        lines: List[str] = f.read().splitlines()
    return lines
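
send_queries_inner returns the raw lines of the Galago run file. A small sketch of how a caller might parse them, assuming TREC-style ranking columns (query id, Q0, document id, rank, score, run tag), which is what galago batch-search normally emits; the helper names here are illustrative:

from typing import List, NamedTuple


class RankedDoc(NamedTuple):
    query_id: str
    doc_id: str
    rank: int
    score: float


def parse_ranked_list(lines: List[str]) -> List[RankedDoc]:
    out = []
    for line in lines:
        parts = line.split()
        if len(parts) < 6:
            continue  # skip anything that is not a ranking row (e.g. stray log lines)
        qid, _q0, doc_id, rank, score = parts[:5]
        out.append(RankedDoc(qid, doc_id, int(rank), float(score)))
    return out


# e.g. ranked = parse_ranked_list(send_queries_inner(index_path, 100, queries, timeout=600))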