Example #1
def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
    data_id_manager = DataIDManager(0, 1000000)
    out_path = os.path.join(save_dir, str(key))
    pred_path = os.path.join(score_dir, str(key))
    tprint("data gen")
    itr = enum_best_segments(pred_path, info)
    insts = []
    for selected_entry in itr:
        selected = decompress_seg_ids_entry(selected_entry)
        assert len(selected['input_ids']) == len(selected['seg_ids'])

        # Pad both sequences to the fixed model input length.
        selected['input_ids'] = pad0(selected['input_ids'], max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(
            selected['input_ids'],
            selected['seg_ids'],
            selected['label'],
            data_id)
        insts.append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
    save_info(save_dir, data_id_manager, str(key) + ".info")
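
All of these snippets lean on a pad0 helper that is not shown. A minimal sketch of the assumed behavior, right-padding a list with zeros up to a target length, is below; the project's actual helper may differ (for example, it may also truncate), which is why several snippets truncate explicitly before padding.

from typing import List

def pad0(seq: List[int], target_len: int) -> List[int]:
    # Assumed behavior (sketch only, not the project's actual helper):
    # right-pad with zeros up to target_len; no truncation here.
    return seq + [0] * (target_len - len(seq))
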
Example #2
def get_dict_input_features(tokenizer, max_def_length, max_d_loc, max_word_len,
                            segment_ids, dict_def, word_loc_list, dict_word):
    d_input_ids = tokenizer.convert_tokens_to_ids(dict_def)
    d_input_ids = d_input_ids[:max_def_length]
    d_input_mask = [1] * len(d_input_ids)

    if word_loc_list:
        # Reuse the segment id at the word's first occurrence so the
        # definition is tagged with the same segment as the word itself.
        target_segment = segment_ids[word_loc_list[0]]
        d_segment_ids = [target_segment] * len(d_input_ids)
    else:
        d_segment_ids = []

    if dict_word is not None:
        selected_word = tokenizer.convert_tokens_to_ids(dict_word.subword_rep)
    else:
        selected_word = []

    d_input_ids = pad0(d_input_ids, max_def_length)
    d_input_mask = pad0(d_input_mask, max_def_length)
    d_location_ids = pad0(word_loc_list[:max_d_loc], max_d_loc)
    d_segment_ids = pad0(d_segment_ids, max_def_length)
    selected_word = pad0(selected_word, max_word_len)

    features = collections.OrderedDict()
    features["d_input_ids"] = btd.create_int_feature(d_input_ids)
    features["d_input_mask"] = btd.create_int_feature(d_input_mask)
    features["d_segment_ids"] = btd.create_int_feature(d_segment_ids)
    features["d_location_ids"] = btd.create_int_feature(d_location_ids)
    features["selected_word"] = btd.create_int_feature(selected_word)
    return features
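
The btd.create_int_feature calls suggest the returned OrderedDict is destined for a TFRecord file. As an assumption (mirroring the helper used in BERT's reference code, not necessarily this project's btd module), the wrapper and the typical serialization step would look like this:

import tensorflow as tf

def create_int_feature(values):
    # Assumed to mirror BERT's helper: wrap the ints in an Int64List feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

# Typical use of the feature dict returned above:
# tf_example = tf.train.Example(features=tf.train.Features(feature=features))
# writer.write(tf_example.SerializeToString())
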
Example #3
    def encode_dict_as_feature(self, dictionary):
        # Encode a single definition per word into fixed-length
        # (input_ids, input_mask) pairs.
        new_dict = {}
        for word, dict_def in dictionary.items():
            d_input_ids = self.tokenizer.convert_tokens_to_ids(dict_def)
            d_input_ids = d_input_ids[:self.max_def_length]
            d_input_mask = [1] * len(d_input_ids)
            d_input_ids = pad0(d_input_ids, self.max_def_length)
            d_input_mask = pad0(d_input_mask, self.max_def_length)
            new_dict[word] = d_input_ids, d_input_mask
        return new_dict
Example #4
def generate_selected_training_data_w_json(info, max_seq_length, save_dir,
                                           get_score_fn, max_seg):
    data_id_manager = DataIDManager(0, 1000000)
    tprint("data gen")

    def get_query_id_group(query_id):
        # Map a query id to the robust interval that contains it;
        # the interval start serves as the output group key.
        for st, ed in robust_query_intervals:
            if st <= int(query_id) <= ed:
                return st

        assert False

    tokenizer = get_tokenizer()
    for data_id, e in info.items():
        input_ids = tokenizer.convert_tokens_to_ids(e['tokens'])
        e['input_ids'] = input_ids

    maybe_num_insts = len(info) // 4  # rough estimate of output size for the progress ticker
    ticker = TimeEstimator(maybe_num_insts)
    itr = enum_best_segments(get_score_fn, info, max_seg)
    insts = collections.defaultdict(list)
    for selected_entry in itr:
        ticker.tick()
        selected = selected_entry
        query_id = selected['query_id']
        q_group = get_query_id_group(query_id)
        assert len(selected['tokens']) == len(selected['seg_ids'])
        input_ids = tokenizer.convert_tokens_to_ids(selected['tokens'])
        selected['input_ids'] = pad0(input_ids, max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(selected['input_ids'], selected['seg_ids'],
                            selected['label'], data_id)
        insts[q_group].append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    for q_group, insts_per_group in insts.items():
        out_path = os.path.join(save_dir, str(q_group))
        write_records_w_encode_fn(out_path, encode_fn, insts_per_group,
                                  len(insts_per_group))
        save_info(save_dir, data_id_manager, str(q_group) + ".info")
Example #5
    def encode_dict_as_feature(self, dictionary):
        # Variant that handles multiple definitions per word: each definition is
        # encoded as a fixed-length (input_ids, input_mask, segment_ids) triple.
        new_dict = {}
        for word, entries in dictionary.items():
            enc_entries = []
            for dict_def in entries:
                d_input_ids = self.tokenizer.convert_tokens_to_ids(dict_def)
                d_input_ids = d_input_ids[:self.max_def_length]
                d_input_mask = [1] * len(d_input_ids)
                d_segment_ids = [0] * len(d_input_ids)

                d_input_ids = pad0(d_input_ids, self.max_def_length)
                d_input_mask = pad0(d_input_mask, self.max_def_length)
                d_segment_ids = pad0(d_segment_ids, self.max_def_length)

                e = d_input_ids, d_input_mask, d_segment_ids
                enc_entries.append(e)

            new_dict[word] = enc_entries
        return new_dict
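
A hedged usage sketch for the variant above; the dictionary contents and the owning encoder object are invented for illustration:

# Sketch only: each word maps to a list of (ids, mask, segment_ids) triples,
# one per definition, all padded to encoder.max_def_length.
enc = encoder.encode_dict_as_feature({
    "bank": [["financial", "institution"], ["river", "##side"]],
})
d_input_ids, d_input_mask, d_segment_ids = enc["bank"][0]
assert len(d_input_ids) == encoder.max_def_length
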
Example #6
    def index(self, tokens):
        words = get_word_tokens(tokens)
        unique_words = filter_unique_words(words)
        valid_words = []
        for word in unique_words:
            # Skip stopwords and words that have no dictionary entry.
            if word.word in self.stopword:
                continue
            if not self.dict_contains(word.word):
                continue
            # Record (and pad) the token locations of the remaining words.
            word.location = get_locations(tokens, word)
            word.location = word.location[:self.get_max_d_loc()]
            word.enc_location = pad0(word.location, self.get_max_d_loc())
            valid_words.append(word)

        return valid_words
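
A hedged sketch of how the indexed words could feed the feature builder from Example #2; the dictionary lookup, the surrounding variables, and the assumption that these word objects also expose the subword_rep used there are all invented for illustration:

# Sketch only: build definition features for every indexed word.
for word in indexer.index(tokens):
    dict_def = dictionary[word.word]  # tokenized definition, assumed structure
    features = get_dict_input_features(
        tokenizer, max_def_length, max_d_loc, max_word_len,
        segment_ids, dict_def, word.location, word)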