def get_dataset_info(filename, filetype, save_file=None, sent_limit=100, ques_limit=50,
                     answer_limit=30, is_clue_topN=20, debug=False, debug_length=20):
    """Annotate each raw example with answer/clue/style info and save the result.

    :param filename: path of the raw dataset file.
    :param filetype: dataset type tag (e.g. SQuAD); also used to derive the
        default output file name when ``save_file`` is not given.
    :param save_file: pickle path for the annotated examples; defaults to
        ``<filetype>_answer_clue_style_info.pkl``.
    :param sent_limit: max sentence tokens passed to get_answer_clue_style_info.
    :param ques_limit: max question tokens passed to get_answer_clue_style_info.
    :param answer_limit: max answer tokens passed to get_answer_clue_style_info.
    :param is_clue_topN: top-N clue candidates to consider.
    :param debug: when True, stop after ``debug_length`` examples.
    :param debug_length: number of examples to process in debug mode.
    :return: list of examples annotated with answer/clue/style info.
    """
    raw_examples = FQG_data.get_raw_examples(filename, filetype, debug, debug_length)
    examples_with_info = []
    for i, e in enumerate(raw_examples):
        new_e = get_answer_clue_style_info(
            e["ans_sent"], e["question"], e["answer_text"], e["answer_start"],
            sent_limit, ques_limit, answer_limit, is_clue_topN)
        examples_with_info.append(new_e)
        if debug and i >= debug_length:
            break
    if save_file is None:
        # BUG FIX: original referenced the undefined name ``file_type`` here,
        # raising NameError whenever no save_file was supplied.
        save_file = filetype + "_answer_clue_style_info.pkl"
    save(save_file, examples_with_info)
    return examples_with_info
def sentences2augmented_sentences(input_path, output_path, start_index, end_index,
                                  sample_probs, num_sample_answer=5, num_sample_clue=2,
                                  num_sample_style=2, max_sample_times=20):
    """Augment sentences ``[start_index, end_index)`` of a TSV file and save them.

    Each input line is expected to look like ``pid<TAB>sid<TAB>sentence``
    — TODO confirm against the file producer.

    :param input_path: path of the tab-separated input sentence file.
    :param output_path: pickle path for the augmented sentence dicts.
    :param start_index: first line index (inclusive) to augment.
    :param end_index: last line index (exclusive) to augment.
    :param sample_probs: sampling probabilities passed to augment_qg_data.
    :param num_sample_answer: number of answers to sample per sentence.
    :param num_sample_clue: number of clues to sample per answer.
    :param num_sample_style: number of question styles to sample.
    :param max_sample_times: cap on total sampling attempts.
    """
    augmented_sentences = []
    # FIX: the ``with`` block already closes the file; the original's trailing
    # explicit infile.close() was redundant and has been removed.
    with codecs.open(input_path, "r", encoding='utf8') as infile:
        sentences = infile.readlines()
    assert start_index < end_index
    assert start_index < len(sentences)
    assert end_index <= len(sentences)
    print("Start augment data...")
    for i in range(start_index, end_index):
        print(i)  # progress: current line index
        s_split = sentences[i].rstrip().split("\t")
        pid, sid, s = s_split[0], s_split[1], s_split[2]
        augmented_s = augment_qg_data(
            s, sample_probs, num_sample_answer, num_sample_clue,
            num_sample_style, max_sample_times)
        augmented_s["pid"] = pid
        augmented_s["sid"] = sid
        augmented_sentences.append(augmented_s)
    save(output_path, augmented_sentences, "save augmented sentences...")
def get_answertag2qtype_mapping(answertag2qtype_dict_file, data_file, data_type):
    """
    Get the mapping between (answer_tags, potential question types).
    We either load a saved dictionary which we calculated and saved before,
    or we create such a dict by analyzing reference_file and save it for future usage.
    :param answertag2qtype_dict_file: we will save the result to this file.
    :param data_file: such as SQuAD data file. We use it to get the mapping.
    :param data_type: SQuAD or NewsQA. See get_raw_examples in FQG_data.py
    :return: a dict maps answer text tags (from the function get_answer_chunk_tags)
        to question types set.
    """
    examples = get_raw_examples(data_file, data_type)
    answertag2qtype = {}
    i = 0
    for e in examples:
        try:
            context_text = e["ans_sent"]
            answer_start = e["answer_start"]
            answer_text = e["answer_text"]
            answer_end = answer_start + len(answer_text) - 1  # inclusive end index
            question = e["question"]
            chunk_tag = get_answer_chunk_tag(context_text, answer_start, answer_end)
            ner_tag = get_answer_ner_tag(context_text, answer_text)
            answertag = "-".join([chunk_tag, ner_tag])
            qtype, qtype_id = get_question_type(question)
            answertag2qtype.setdefault(answertag, []).append(qtype)
        except Exception:
            # FIX: narrowed the original bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit); malformed examples are skipped.
            continue
        i += 1
        print(i)  # progress: number of successfully processed examples
    answertag2qtype_set = {k: set(v) for k, v in answertag2qtype.items()}
    answertag2qtype_counter = {k: Counter(v) for k, v in answertag2qtype.items()}
    result = {
        "answertag2qtype": answertag2qtype,
        "answertag2qtype_set": answertag2qtype_set,
        "answertag2qtype_counter": answertag2qtype_counter
    }
    save(answertag2qtype_dict_file, result, message="save answertag2qtype dict")
    print(answertag2qtype_set)
    print(answertag2qtype_counter)
    return answertag2qtype_set
def get_sample_probs(filename, filetype, save_dataset_info_file=None, save_sample_probs_file=None,
                     sent_limit=100, ques_limit=50, answer_limit=30, is_clue_topN=20,
                     debug=False, debug_length=20,
                     answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
                     clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0, clue_dep_dist_max_val=100):
    """Estimate the factorized sampling probabilities from an annotated dataset.

    P(a, c, s) = p(a) * p(c|a) * p(s|c, a)
               = p(a|a_tag, a_length) * p(c|c_tag, dep_dist) * p(s|a_tag)

    NOTE(review): clue_dep_dist_max_val was 20 in an earlier version, which may
    inflate probabilities for dep distances > 20 — hence the large cap of 100.

    :param filename: path of the raw dataset file.
    :param filetype: dataset type tag; used for default output file names.
    :param save_dataset_info_file: passed to get_dataset_info as its save path.
    :param save_sample_probs_file: pickle path for the resulting counters;
        defaults to ``<filetype>_sample_probs.pkl``.
    :return: dict of Counters keyed "a", "c|a", "s|c,a".
    """
    examples_with_info = get_dataset_info(
        filename, filetype, save_dataset_info_file,
        sent_limit, ques_limit, answer_limit, is_clue_topN, debug, debug_length)
    sla_tag = []            # for p(s|a_tag); "l" denotes "|"
    clc_tag_dep_dist = []   # for p(c|c_tag, dep_dist); "l" denotes "|"
    ala_tag_a_length = []   # for p(a|a_tag, a_length); "l" denotes "|"
    print(f"\n[DEBUG] in get_sample_probs, examples: {len(examples_with_info)}")
    utu = 0
    for e in examples_with_info:
        # Progress print thins out as the count grows (every 1/10/100/1000).
        if utu <= 10 or (utu <= 100 and utu % 10 == 0) or (utu <= 1000 and utu % 100 == 0) or (utu % 1000 == 0):
            print(f"[DEBUG] {utu}/{len(examples_with_info)}")
        utu += 1  # BUG FIX: the counter was never incremented, so the original
                  # printed "[DEBUG] 0/N" on every single iteration.
        a_tag = "-".join([e["answer_pos_tag"], e["answer_ner_tag"]])  # answer tag
        s = e["question_type"][0]  # question style (type)
        a_length = e["answer_length"]
        a_length_bin = val2bin(a_length, answer_length_min_val, answer_length_max_val,
                               answer_length_bin_width)
        c_tag = "-".join([e["clue_pos_tag"], e["clue_ner_tag"]])
        dep_dist = e["clue_answer_dep_path_len"]
        dep_dist_bin = val2bin(dep_dist, clue_dep_dist_min_val, clue_dep_dist_max_val,
                               clue_dep_dist_bin_width)
        sla_tag.append("_".join([s, a_tag]))
        clc_tag_dep_dist.append("_".join([c_tag, str(dep_dist_bin)]))
        ala_tag_a_length.append("_".join([a_tag, str(a_length_bin)]))
    sample_probs = {
        "a": Counter(ala_tag_a_length),
        "c|a": Counter(clc_tag_dep_dist),
        "s|c,a": Counter(sla_tag)}
    if save_sample_probs_file is None:
        save_sample_probs_file = filetype + "_sample_probs.pkl"
    save(save_sample_probs_file, sample_probs)
    print("\n[DEBUG] return probs")
    return sample_probs
def prepro(config, augmented_sentences_pkl_file, processed_augmented_sentences_pkl_file):
    """Spacy-process and featurize augmented sentences, then save the result.

    :param config: experiment configuration object.
    :param augmented_sentences_pkl_file: pickle of augmented sentence dicts to load.
    :param processed_augmented_sentences_pkl_file: output pickle path.
    """
    is_debug = config.debug
    max_examples = config.debug_batchnum * config.batch_size
    # Run spacy processing over the loaded augmented sentences.
    data = load(augmented_sentences_pkl_file)
    data = get_spacy_processed_examples(config, data, is_debug, max_examples, shuffle=False)
    # Attach features using the saved embedding dictionaries.
    dicts = load(config.emb_dicts_file)
    data = get_featured_examples(config, data, dicts)
    save(processed_augmented_sentences_pkl_file, data,
         message="processed_augmented_sentences_pkl_file")
def prepro(config):
    """Full preprocessing pipeline for train/dev/test splits.

    Stages (each guarded by a ``config.processed_*`` flag so completed work
    is loaded from disk instead of recomputed):
      1. spacy-process raw examples and build vocabulary counters (train only);
      2. build embedding matrices/dicts per embedding tag;
      3. build the related-words dict and id matrix;
      4. attach features to all three splits.
    All intermediate artifacts are saved to the paths given in ``config``.
    """
    emb_tags = config.emb_config.keys()
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size
    # get train spacy processed examples and counters
    if not config.processed_by_spacy and not config.processed_example_features:
        train_examples = get_raw_examples(config.train_file, config.data_type, debug, debug_length)
        train_examples, train_meta, train_eval = get_spacy_processed_examples(
            config, train_examples, debug, debug_length, shuffle=False)
        dev_examples = get_raw_examples(config.dev_file, config.data_type, debug, debug_length)
        dev_examples, dev_meta, dev_eval = get_spacy_processed_examples(
            config, dev_examples, debug, debug_length, shuffle=False)
        test_examples = get_raw_examples(config.test_file, config.data_type, debug, debug_length)
        test_examples, test_meta, test_eval = get_spacy_processed_examples(
            config, test_examples, debug, debug_length, shuffle=False)
        counters = get_updated_counters_by_examples(
            config, None, train_examples, increment=1, init=True, finish=True)  # only use train data
        final_counters = copy.deepcopy(counters)
        save(config.train_examples_file, train_examples, message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
        save(config.counters_file, final_counters, message="counters")
    else:
        # Everything already spacy-processed: restore all artifacts from disk.
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)
        final_counters = load(config.counters_file)
        counters = final_counters
    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                final_counters[tag], tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))
    # get related_words_dict and related_words_ids_mat
    if not config.processed_related_words:
        related_words_dict = get_related_words_dict(
            list(emb_dicts["word"].keys()), config.max_topN)
        related_words_ids_mat = get_related_words_ids_mat_with_related_words_dict(
            emb_dicts["word"], config.max_topN, related_words_dict)
        save(config.related_words_dict_file, related_words_dict, message="related words dict")
        save(config.related_words_ids_mat_file, related_words_ids_mat, message="related words ids mat")
    else:
        related_words_dict = load(config.related_words_dict_file)
        related_words_ids_mat = load(config.related_words_ids_mat_file)
    # get featured examples
    # TODO: handle potential insert SOS EOS problem when extracting tag features
    if not config.processed_example_features:
        train_examples, train_meta = get_featured_examples(
            config, train_examples, train_meta, "train", emb_dicts,
            related_words_ids_mat, related_words_dict)
        dev_examples, dev_meta = get_featured_examples(
            config, dev_examples, dev_meta, "dev", emb_dicts,
            related_words_ids_mat, related_words_dict)
        test_examples, test_meta = get_featured_examples(
            config, test_examples, test_meta, "test", emb_dicts,
            related_words_ids_mat, related_words_dict)
        save(config.train_examples_file, train_examples, message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)
    # print to txt to debug """
    # NOTE(review): the source visible here ends with a stray '"""' fragment
    # after this comment — it looks like a commented-out debug-dump section was
    # truncated; confirm against the full original file.
def prepro(config):
    """Preprocess the train set: featured examples, counters, embeddings,
    and graph features, saving each artifact and dumping debug text files.

    :param config: experiment configuration object providing file paths,
        embedding settings, and ``processed_*`` resume flags.
    """
    tag_list = config.emb_tags
    tag_cfg = config.emb_config
    emb_mats = {}
    emb_dicts = {}
    is_debug = config.debug
    max_examples = config.debug_batchnum * config.batch_size
    # Stage 1: raw + featured examples plus vocabulary counters.
    if config.processed_example_features:
        examples, num_relations = load(config.train_examples_file)
        counters = load(config.counters_file)
    else:
        examples = get_raw_examples(config, config.train_file, is_debug, max_examples)
        examples = get_featured_examples(config, examples)
        counters = get_counters(examples, config.emb_tags, config.emb_not_count_tags)
        save(config.train_examples_file, (examples, 0), message="examples")
        save(config.counters_file, counters, message="counters")
    # Stage 2: embedding matrices and lookup dictionaries per tag.
    if config.processed_emb:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    else:
        for tag in tag_list:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                counters[tag], tag,
                emb_file=tag_cfg[tag]["emb_file"],
                size=tag_cfg[tag]["emb_size"],
                vec_size=tag_cfg[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    for name in emb_dicts:
        print("Embedding dict length: " + name + " " + str(len(emb_dicts[name])))
    # Stage 3: graph features. NOTE(review): edge_types2ids is rebuilt from
    # scratch with update_edge_types2ids=True — intended for the train set only.
    if not config.processed_example_graph_features:
        edge_types2ids = {}
        examples, num_relations, edge_types2ids = get_graph_examples(
            config, examples, config.edge_types_list, emb_dicts,
            edge_types2ids, update_edge_types2ids=True)
        emb_dicts["edge_types"] = edge_types2ids
        save(config.train_examples_file, (examples, num_relations), message="examples")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    # Dump dicts, counters, and one sample example to text for manual inspection.
    for name in emb_dicts:
        write_dict(emb_dicts[name], OUTPUT_PATH + "debug/emb_dicts_" + str(name) + ".txt")
    for name in counters:
        write_counter(counters[name], OUTPUT_PATH + "debug/counters_" + str(name) + ".txt")
    write_example(examples[5], OUTPUT_PATH + "debug/example.txt")