Example #1
File: practice.py Project: clover3/Chair
def main():
    clean_doc_sample: Dict[str, Tuple] = load_from_pickle(
        "clean_clueweb_doc_sample")
    doc_json_list = lmap(
        json.loads,
        open(at_output_dir("clueweb", "doc_content_samples.json"), "r"))
    d = difflib.Differ()
    html_diff = difflib.HtmlDiff()

    clean_fn = text_from_html
    for doc_json in doc_json_list[1:]:
        html = doc_json['content']
        with open(at_output_dir("visualize", "text.html"), "w",
                  encoding="utf-8") as out_f:
            out_f.write(html)
        doc_id = doc_json['id']
        title, cleaned_text_ref = clean_doc_sample[doc_id]
        cleaned_text = clean_fn(html)
        # print(cleaned_text_ref)
        # print(cleaned_text)

        tokens_ref = nltk.tokenize.wordpunct_tokenize(cleaned_text_ref)
        tokens = nltk.tokenize.wordpunct_tokenize(cleaned_text)
        print(" ".join(tokens_ref))
        print(" ".join(tokens))
        diff = d.compare(tokens_ref, tokens)
        break
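Every example on this page builds file paths with at_output_dir. Judging from Examples #14, #18 and #20, which call os.path.join(output_path, ...) on the same directory names before using it, it is most likely a thin wrapper over a project-wide output root. A minimal sketch, assuming output_path is the module-level root seen in those examples (the real definition lives in clover3/Chair):

import os

def at_output_dir(dir_name, file_name):
    # Assumed behavior: join the shared output root with a sub-directory
    # and a file name.
    return os.path.join(output_path, dir_name, file_name)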
Example #2
def main():
    html_template_path = os.path.join(data_path, "med_contradiction",
                                      "annotation", "annotation_template.html")
    input_csv_path = at_output_dir("alamri_pilot", "pilot_pairs.csv")
    html_out_dir = at_output_dir("alamri_pilot", "pilot_pairs_html")
    csv_link_output = at_output_dir("alamri_pilot", "pilot_links.csv")

    generate(input_csv_path, html_template_path, html_out_dir, csv_link_output)
Example #3
def main():
    work(range(2009, 2013), KEYWORD_QUERY,
         at_output_dir("clueweb", "keyword_09b_query.json"))
    work(range(2009, 2013), DESC_QUERY,
         at_output_dir("clueweb", "desc_09b_query.json"))
    work(range(2013, 2015), KEYWORD_QUERY,
         at_output_dir("clueweb", "keyword_12b_query.json"))
    work(range(2013, 2015), DESC_QUERY,
         at_output_dir("clueweb", "desc_12b_query.json"))
Example #4
def generate_and_write(file_name, generate_fn, tokenizer):
    data_id_man = DataIDManager()
    inst_list = generate_fn(data_id_man)
    max_seq_length = 300
    save_path = at_output_dir("alamri_tfrecord", file_name)
    encode_fn = get_encode_fn(max_seq_length, tokenizer)
    write_records_w_encode_fn(save_path, encode_fn, inst_list)
    info_save_path = at_output_dir("alamri_tfrecord", file_name + ".info")
    with open(info_save_path, "w") as f:
        json.dump(data_id_man.id_to_info, f)
Example #5
def show_high():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    # prediction_file = at_output_dir("clue_counter_arg", "ada_aawd4_clue.4000.score")
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    for idx, e in enumerate(pred_data):
        score = logit_to_score_softmax(e['logits'])
        if int(score * 100) == 13:
            print(e['text'])
Example #6
def get_f5_tids_score_d_from_bert():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))

    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)
    score_d = {}

    for idx, e in enumerate(pred_data):
        score = logit_to_score_softmax(e['logits'])
        text = e['text']
        score_d[text] = score
    return score_d
Example #7
def __init__(self, queries, qid_list, probe_config):
    self.long_seg_score_path_format = at_output_dir("rqd", "rqd_{}.score")
    self.short_seg_score_path_format = at_output_dir("rqd", "rqd_sm_{}.score")
    info_file_path = at_output_dir("robust", "seg_info")
    f_handler = get_format_handler("qc")
    self.f_handler = f_handler
    self.info: Dict = load_combine_info_jsons(
        info_file_path, f_handler.get_mapping(), f_handler.drop_kdp())
    self.doc_piece_score_d: Dict[Tuple[str, str], List[ScoredPieceFromPair]] = {}
    self.prepared_qids = set()
    self.probe_config = probe_config
    self.queries = queries
    self.tokenizer = get_tokenizer()
    self.qid_list: List[str] = qid_list
    self.not_found_cnt = 0
Example #8
def main():
    num_layers = 12
    dva = DictValueAverage()

    all_val = defaultdict(list)
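    # Only the first score shard is read here (range(1)); Example #12
    # appears to write one such pickle per prediction step.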
    for i in range(1):
        save_path = at_output_dir("lms_scores", str(i) + ".pickle")
        output_d = load_pickle_from(save_path)
        input_mask = output_d['input_mask']  # [num_inst, seq_length]
        for layer_no in range(num_layers):
            probs = sigmoid(
                output_d['logits'][layer_no])  # [num_inst, seq_length, 2]
            num_inst, seq_length, maybe_2 = np.shape(probs)

            for data_idx in range(num_inst):
                for seq_idx in range(seq_length):
                    if input_mask[data_idx, seq_idx]:
                        key = layer_no
                        v = probs[data_idx, seq_idx, 1]
                        dva.add(key, v)
                        all_val[key].append(v)

    for k, v in dva.all_average().items():
        print(k, v)

    for k, l in all_val.items():
        max_val = max(l)
        print(k, max_val)
Example #9
def main():
    saved_dir = at_output_dir("perspective_experiments", "clueweb_qres")
    path1 = os.path.join(saved_dir, "train.txt")
    path2 = os.path.join(saved_dir, "dev.txt")

    rlg1 = load_ranked_list_grouped(path1)
    rlg2 = load_ranked_list_grouped(path2)
    k = 10

    most_common = []
    for query_id1 in rlg1:
        for query_id2 in rlg2:
            top_k_docs1 = lmap(TrecRankedListEntry.get_doc_id,
                               rlg1[query_id1][:k])
            top_k_docs2 = lmap(TrecRankedListEntry.get_doc_id,
                               rlg2[query_id2][:k])
            common = set(top_k_docs1).intersection(top_k_docs2)
            percent_common = len(common) / k
            if percent_common > 0.1:
                most_common.append((percent_common, query_id1, query_id2))

    most_common.sort(key=get_first, reverse=True)

    for rate_common, qid1, qid2 in most_common[:10]:
        print(rate_common, qid1, qid2)
Example #10
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()

    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()
    ca_cid = 1

    out_j = []
    for e in pc_data[:100]:
        cid = e['cId']
        if not gold_d[cid]:
            continue
        c_text = e['text']
        for pc in gold_d[cid]:
            if random.random() < 0.3:
                first_pid = pc.perspective_ids[0]
                p_text = perspective_getter(first_pid)
                j_entry = {
                    'cid': cid,
                    'claim_text': c_text,
                    'ca_cid': ca_cid,
                    'perspective': {
                        'stance': pc.stance_label_3,
                        'pid': first_pid,
                        'p_text': p_text
                    }
                }
                ca_cid += 1
                out_j.append(j_entry)
    print("total of {}".format(len(out_j)))
    with open(at_output_dir("ca_building", "claims.step1.txt"),
              "w", encoding="utf-8") as out_f:
        json.dump(out_j, out_f, indent=True)
Example #11
def make_tfrecord(source_name, target_name):
    source_data = data_d[source_name]
    target_data = data_d[target_name]
    combined_data = combine_source_and_target(source_data, target_data, 1)
    save_path = at_output_dir(
        dir_name, "{}_to_{}_train".format(source_name, target_name))
    write_records_w_encode_fn(save_path, encode_fn, combined_data)
Example #12
def do_predict(
    bert_hp,
    train_config,
    data,
    lms_config,
    modeling_option,
    init_fn,
):
    num_gpu = train_config.num_gpu
    train_batches, dev_batches = data

    lms_model = LMSModel(modeling_option, bert_hp, lms_config, num_gpu)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)

    step_size = 100
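    # Slide a window of step_size batches over train_batches, score each
    # window, and save one pickle per window.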
    for i in range(100):
        st = i * step_size
        ed = st + step_size
        # make sure the explain train_op does not increase the global step
        tprint(st, ed)
        output_d = predict_fn(sess, train_batches[st:ed], lms_model.logits,
                              lms_model.loss_tensor, lms_model.ex_score_tensor,
                              lms_model.per_layer_logit_tensor,
                              lms_model.batch2feed_dict)

        save_path = at_output_dir("lms_scores", str(i))
        save_to_pickle(output_d, save_path)
Example #13
def main():
    data_id_manager = DataIDManager()
    data = []
    for text in enum_f5_data():
        info = {
            'text': text,
        }
        data_id = data_id_manager.assign(info)
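        # Placeholder label: these records are scored later and joined back
        # to their text through the .info file (see Examples #5 and #25).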
        label = 0
        data.append(TextInstance(text, label, data_id))

    encode_fn = get_encode_fn_w_data_id(512, False)
    save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, data)

    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    with open(info_save_path, "w") as f:
        json.dump(data_id_manager.id_to_info, f)
Example #14
File: aawd_gen.py Project: clover3/Chair
def main():
    exist_or_mkdir(os.path.join(output_path, "aawd_tfrecord"))
    train, dev, test = load_aawd_splits()
    todo = [(train, "train"), (dev, "dev"), (test, "test")]
    encode_fn = get_encode_fn(256)
    for data, split in todo:
        save_path = at_output_dir("aawd_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
Example #15
def num_files_to_touch():
    doc_id_list = readlines_strip(
        at_output_dir("clueweb", "not_found.sort.txt"))
    grouped = group_by(doc_id_list, get_doc_group)
    dir_helper = get_sydney_clueweb09_corpus_helper()
    for group_id, doc_ids in grouped.items():
        gz_files = dir_helper.iter_gz_files_for_group(group_id)
        print(len(doc_ids), len(gz_files))
Example #16
def aawd_pred_histogram():
    # prediction_file = at_output_dir("clue_counter_arg",
    #                                 "ada_argu3_aawd_20000.score")
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = EstimatorPredictionViewer(prediction_file)

    def bin_fn(score):
        return str(int(score * 1000))

    histogram = BinHistogram(bin_fn)
    for idx, e in enumerate(pred_data):
        score = logit_to_score_softmax(e.get_vector('logits'))
        histogram.add(score)

    # bin_fn scales by 1000, so this loop only covers bins 0-100
    # (scores up to ~0.1); higher bins are never printed.
    for i in range(101):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
Example #17
def main():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd4_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    def bin_fn(score):
        return str(int(score * 100))

    histogram = BinHistogram(bin_fn)
    for idx, e in enumerate(pred_data):
        score = logit_to_score_softmax(e['logits'])
        histogram.add(score)

    for i in range(101):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
Example #18
def binary_gen():
    exist_or_mkdir(os.path.join(output_path, "argu_ana_tfrecord"))
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    train = zip(train_x, train_y)
    dev = zip(dev_x, dev_y)
    todo = [(train, "train"), (dev, "dev")]
    encode_fn = get_encode_fn(512)
    for data, split in todo:
        save_path = at_output_dir("argu_ana_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
Example #19
def main():
    target_data_idx = int(sys.argv[1])
    info_dir = "/mnt/disks/disk100/data_info/robust_w_data_id_desc_info_pickle/"
    max_seq_length = 512
    score_and_save_dir = []
    base_model_name = "robust_3A"
    for split_idx in range(5):
        for repeat_idx in range(5):
            if target_data_idx != split_idx:
                score_dir_name = "seg_score_{}_{}_{}".format(
                    base_model_name, split_idx, repeat_idx)
                score_dir_path = at_output_dir("robust", score_dir_name)
                save_dir_path = at_output_dir("robust_seg_sel", score_dir_name)
                score_and_save_dir.append((score_dir_path, save_dir_path))

    generate_selected_training_data_for_many_runs(
        target_data_idx, info_dir, max_seq_length, score_and_save_dir,
        generate_selected_training_data)
Example #20
File: gen_to_csv.py Project: clover3/Chair
def main():
    exist_or_mkdir(os.path.join(output_path, "alamri_tfrecord"))

    data_id_manager = DataIDManager()
    entries = []
    for claim1, claim2 in enum_true_instance():
        entries.append((claim1.text, claim2.text))

    save_path = at_output_dir("alamri_pilot", "true_pairs_all.csv")
    csv_writer = csv.writer(open(save_path, "w", newline='', encoding="utf-8"))
    foreach(csv_writer.writerow, entries)
Example #21
def main():
    f = open(at_output_dir("clueweb", "doc_ids_sample.txt"), "r")
    doc_ids = list([l.strip() for l in f])

    doc_contents = read_doc_id_title_text()

    new_d = {}
    for doc_id in doc_ids:
        t = doc_contents[doc_id]
        new_d[doc_id] = t

    save_to_pickle(new_d, "clean_clueweb_doc_sample")
Example #22
def main():
    save_path = at_output_dir("alamri_pilot", "pilot_pairs.csv")

    entries = []
    for claim1, claim2 in enum_true_instance(3):
        print("--")
        print("{}".format(claim1.text))
        print("{}".format(claim2.text))
        entries.append((claim1.text, claim2.text))

    csv_writer = csv.writer(open(save_path, "w", newline='', encoding="utf-8"))
    foreach(csv_writer.writerow, entries)
Example #23
def main():
    query_type = "desc"
    queries = load_robust_04_query(query_type)
    qid_list = get_robust_qid_list()
    tokenizer = get_tokenizer()

    f = open(at_output_dir("robust", "desc_query_len.txt"), "w")
    for qid in qid_list:
        query = queries[str(qid)]
        query_tokens = tokenizer.tokenize(query)
        n_terms = len(query_tokens)
        f.write("{}\n".format(n_terms))
    f.close()
Example #24
def main():
    target_data_idx = int(sys.argv[1])
    max_seq_length = int(sys.argv[2])
    max_seg = int(sys.argv[3])
    info_path = os.path.join(job_man_dir,
                             "robust_w_data_id_desc_info_pickle",
                             "{}".format(target_data_idx))

    info = load_pickle_from(info_path)
    save_dir_path = at_output_dir("robust_seg_sel", "exact_match{}_{}".format(max_seq_length, max_seg))
    exist_or_mkdir(save_dir_path)
    get_score_fn = get_score_fn_functor()
    generate_selected_training_data(info, max_seq_length, save_dir_path,
                                    get_score_fn, max_seg)
Example #25
def print_top_k():
    k = 30
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    simple_data = []

    text_set = set()
    for idx, e in enumerate(pred_data):
        score = logit_to_score_softmax(e['logits'])
        text = e['text']
        if text in text_set:
            continue
        text_set.add(text)
        simple_data.append((text, score))

    simple_data.sort(key=get_second, reverse=True)
    for text, score in simple_data[:k]:
        tab_print(score * 100, text)
Example #26
def main():
    args = arg_parser.parse_args(sys.argv[1:])
    target_data_idx = int(args.target_data)
    max_seq_length = int(args.max_seq_length)
    max_seg = int(args.max_seg)
    info_path = args.info_path
    info = load_combine_info_jsons(info_path)
    save_dir_path = at_output_dir(
        "robust_seg_sel", "exact_match{}_{}".format(max_seq_length, max_seg))
    exist_or_mkdir(save_dir_path)
    get_score_fn = get_score_fn_functor()
    generate_selected_training_data_w_json(info, max_seq_length, save_dir_path,
                                           get_score_fn, max_seg)
Example #27
def main():
    cr = CluewebReranking(list(range(2010, 2013)))
    all_docs = read_doc_id_title_text()

    f = open(at_output_dir("clueweb", "not_found.txt"), "w")
    not_found = 0
    for qid in cr.qrels.keys():
        for doc_id in cr.get_docs_for_training(qid):
            if doc_id not in all_docs:
                not_found += 1
                f.write("{}\n".format(doc_id))

    print("not found", not_found)
Example #28
def main():
    doc_id_list = readlines_strip(
        at_output_dir("clueweb", "not_found.sort.txt"))
    grouped = group_by(doc_id_list, get_doc_group)

    todo: List[Tuple[str, List]] = list(grouped.items())
    todo.sort(key=get_first)
    num_jobs = len(grouped)

    def worker_factory(out_dir):
        return GetDocWorker(todo, out_dir)

    print("num jobs", num_jobs)
    runner = JobRunner(job_man_dir, num_jobs - 1, "get_missing_clueweb09_docs",
                       worker_factory)
    runner.start()
Example #29
def read_doc_id_title_text():
    doc_id_title_text = at_output_dir("clueweb", "doc_id_title_text.txt")

    all_doc_id = []
    out_d = {}
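    # Each line is expected to look like: <doc_id>[SEP]<title>[SEP]<content>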
    for line in open(doc_id_title_text, "r"):
        first_sep = line.find("[SEP]")
        second_sep = line.find("[SEP]", first_sep+1)

        sep_len = len("[SEP]")
        doc_id = line[:first_sep]
        title = line[first_sep+sep_len:second_sep]
        content = line[second_sep+sep_len:]
        all_doc_id.append(doc_id)
        out_d[doc_id] = title, content
    print("num unique docs:", len(set(all_doc_id)))
    return out_d
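This is the loader behind Examples #21 and #27, which both call read_doc_id_title_text() to get the doc_id -> (title, content) mapping.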
Example #30
File: bm25_run.py Project: clover3/Chair
def main():
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    out_entries = []
    for query_id, entries in rlg.items():
        doc_ids = list([e.doc_id for e in entries])
        docs = load_per_query_docs(query_id, None)

        found_doc_ids = list([d.doc_id for d in docs])
        not_found_doc_ids = list(
            [doc_id for doc_id in doc_ids if doc_id not in found_doc_ids])
        doc_id_len = len(not_found_doc_ids)
        if doc_id_len:
            print("{} docs not found".format(doc_id_len))

        query_text = query_d[QueryID(query_id)]

        def score(doc: MSMarcoDoc):
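            # BM25 score of the query against the concatenated title and body.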
            content = doc.title + " " + doc.body
            return bm25_module.score(query_text, content)

        scored_docs = list([(d, score(d)) for d in docs])
        scored_docs.sort(key=get_second, reverse=True)

        reranked_entries = []
        for rank, (doc, score) in enumerate(scored_docs):
            e = TrecRankedListEntry(query_id, doc.doc_id, rank, score,
                                    run_name)
            reranked_entries.append(e)
        out_entries.extend(reranked_entries)
        te.tick()

        if len(out_entries) > 100 * 100:
            break

    write_trec_ranked_list_entry(out_entries, save_path)