Example #1
def get_dict_input_features(tokenizer, max_def_length, max_d_loc, max_word_len,
                            segment_ids, dict_def, word_loc_list, dict_word):
    d_input_ids = tokenizer.convert_tokens_to_ids(dict_def)
    d_input_ids = d_input_ids[:max_def_length]
    d_input_mask = [1] * len(d_input_ids)

    if word_loc_list:
        target_segment = segment_ids[word_loc_list[0]]
        d_segment_ids = [target_segment] * len(d_input_ids)
    else:
        d_segment_ids = []

    if dict_word is not None:
        selected_word = tokenizer.convert_tokens_to_ids(dict_word.subword_rep)
    else:
        selected_word = []

    d_input_ids = pad0(d_input_ids, max_def_length)
    d_input_mask = pad0(d_input_mask, max_def_length)
    d_location_ids = pad0(word_loc_list[:max_d_loc], max_d_loc)
    d_segment_ids = pad0(d_segment_ids, max_def_length)
    selected_word = pad0(selected_word, max_word_len)

    features = collections.OrderedDict()
    features["d_input_ids"] = btd.create_int_feature(d_input_ids)
    features["d_input_mask"] = btd.create_int_feature(d_input_mask)
    features["d_segment_ids"] = btd.create_int_feature(d_segment_ids)
    features["d_location_ids"] = btd.create_int_feature(d_location_ids)
    features["selected_word"] = btd.create_int_feature(selected_word)
    return features
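The `pad0` helper used above is not shown in the snippet; a minimal sketch, assuming it simply right-pads a list with zeros up to the requested length (the inputs are already truncated before the call), would be:

def pad0(seq, target_len):
    # Right-pad with zeros so every feature has a fixed length.
    return seq + [0] * (target_len - len(seq))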
Example #2
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        success = False
        for key in feature:
            v = take(feature[key])
            if key == "input_ids":
                alt_emb_mask = [0] * len(v)
                s = set(v)
                if len(s.intersection(all_tokens)) >= min_overlap:
                    for word in seq_set:
                        pre_match = 0
                        for i in range(len(v)):
                            if v[i] == word[pre_match]:
                                pre_match += 1
                            else:
                                # Restart the match, re-checking the current
                                # token against the first id of the word so a
                                # partial overlap is not silently dropped.
                                pre_match = 1 if v[i] == word[0] else 0
                            if pre_match == len(word):
                                pre_match = 0
                                # Mark every position covered by the match.
                                for j in range(i - len(word) + 1, i + 1):
                                    alt_emb_mask[j] = 1
                                    success = True
                new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
            new_features[key] = create_int_feature(v)

        if success:
            return new_features
        else:
            return None
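The `take` helper is likewise not defined in these snippets; given that Example #29 below reads `.int64_list.value` directly off a feature, a reasonable sketch is:

def take(feature):
    # Extract the raw int values from a tf.train.Feature
    # (cf. the direct .int64_list.value access in Example #29).
    return list(feature.int64_list.value)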
Example #3
def ordered_dict_from_input_segment_mask_ids(input_ids, input_mask,
                                             segment_ids):
    features = collections.OrderedDict()
    features["input_ids"] = btd.create_int_feature(input_ids)
    features["input_mask"] = btd.create_int_feature(input_mask)
    features["segment_ids"] = btd.create_int_feature(segment_ids)
    return features
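Nearly every example relies on `create_int_feature` / `create_float_feature` (sometimes accessed through the `btd` module). They are not defined in these snippets; a minimal sketch, following the convention of the original BERT data-generation code, would be:

import tensorflow as tf

def create_int_feature(values):
    # Wrap a list of ints as a tf.train.Feature for TFRecord serialization.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def create_float_feature(values):
    # Wrap a list of floats as a tf.train.Feature for TFRecord serialization.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))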
Example #4
def augment(short_records,
            long_records,
            target_len,
            save_dir,
            start_record_idx=0):
    exist_or_mkdir(save_dir)
    record_idx = start_record_idx
    print("record_idx", record_idx)

    def get_next_writer():
        return RecordWriterWrap(os.path.join(save_dir, str(record_idx)))

    writer = get_next_writer()
    cnt = 0
    while cnt < target_len:
        first_inst = next(short_records)
        second_inst = next(long_records)

        first_inst = feature_to_ordered_dict(first_inst)
        first_inst["next_sentence_labels"] = create_int_feature([1])
        second_inst = feature_to_ordered_dict(second_inst)
        second_inst["next_sentence_labels"] = create_int_feature([1])

        writer.write_feature(first_inst)
        writer.write_feature(second_inst)
        #
        cnt += 2
        if writer.total_written >= 100000:
            record_idx += 1
            print("Wrote {} data".format(cnt))
            writer.close()
            writer = get_next_writer()

    writer.close()
    return
Example #5
def rel_filter(tfrecord_itr,
               relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits, Logits]],
               cpid_to_label: Dict[CPIDPair, int]) -> Iterator[OrderedDict]:

    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue

        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]

        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]

        weight = c_score * p_score
        label: int = cpid_to_label[cpid]

        if weight > 0.5:
            new_feature = combine_segment(last_feature, features)
            #new_feature['weight'] = create_float_feature([weight])
            new_feature['label_ids'] = create_int_feature([label])
            new_feature['data_id'] = create_int_feature([data_id])
            yield new_feature
        last_feature = None
Example #6
def encode_w_data_id(tokenizer, max_seq_length, t: Instance):
    tokens = ["[CLS]"] + t.tokens + ["[SEP]"]
    segment_ids = [0] * (len(t.tokens) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                       segment_ids)
    features['label_ids'] = create_int_feature([int(t.label)])
    features['data_id'] = create_int_feature([int(t.data_id)])
    return features
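`get_basic_input_feature`, called here and in many of the later examples, is also external to these snippets. A plausible minimal sketch, assuming it follows the same convert-mask-pad pattern spelled out in Example #17 (the real helper in the source repository may differ), is:

import collections

def get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids):
    # Convert tokens to ids, build the attention mask, and zero-pad all three
    # sequences to max_seq_length (create_int_feature as sketched above).
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features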
Example #7
def combine_feature(lm_entry, nli_entry):
    new_features = collections.OrderedDict()
    for key in lm_entry:
        new_features[key] = create_int_feature(take(lm_entry[key]))
    for key in nli_entry:
        if key == "label_ids":
            new_features[key] = create_int_feature(take(nli_entry[key]))
        else:
            new_key = "nli_" + key
            new_features[new_key] = create_int_feature(take(nli_entry[key]))
    return new_features
Example #8
def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))

    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)

    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir,
                            "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except:
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id, output1.masked_lm_ids[i],
                    output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i],
                                                      sep_indice)
            except:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(
            output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(
            output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(
            output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(
            output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(
            output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()

    record_writer.close()
Example #9
def get_segment_and_mask_inner(input_ids, sep_indice):
    a_len = sep_indice[0] + 1
    b_len = sep_indice[1] + 1 - a_len
    pad_len = len(input_ids) - (a_len + b_len)
    segment_ids = [0] * a_len + [1] * b_len + [0] * pad_len
    input_mask = [1] * (a_len + b_len) + [0] * pad_len
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features
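To make the segment layout concrete, here is a worked call with hypothetical token ids: two [SEP]s (id 102) at positions 3 and 6 in a length-8 sequence.

input_ids = [101, 7592, 2088, 102, 2023, 2003, 102, 0]
features = get_segment_and_mask_inner(input_ids, [3, 6])
# a_len = 4, b_len = 3, pad_len = 1, so the encoded values are:
# segment_ids -> [0, 0, 0, 0, 1, 1, 1, 0]
# input_mask  -> [1, 1, 1, 1, 1, 1, 1, 0]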
Example #10
def get_masked_lm_features(tokenizer, max_predictions_per_seq,
                           masked_lm_positions, masked_lm_labels):
    masked_lm_positions, masked_lm_ids, masked_lm_weights = \
        get_masked_lm_features_as_list(tokenizer, max_predictions_per_seq, masked_lm_positions, masked_lm_labels)

    features = collections.OrderedDict()
    features["masked_lm_positions"] = btd.create_int_feature(
        masked_lm_positions)
    features["masked_lm_ids"] = btd.create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = btd.create_float_feature(masked_lm_weights)
    return features
Example #11
def enc_to_feature2(tokenizer, max_seq_length, inst: QCInstanceTokenized) -> OrderedDict:
    seg1 = inst.query_text
    seg2 = inst.candidate_text

    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)

    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["data_id"] = create_int_feature([int(inst.data_id)])
    feature["label_ids"] = create_int_feature([int(inst.is_correct)])
    return feature
Example #12
def encode_sr(sr: SegmentRepresentation,
              max_seq_length,
              label_id,
              data_id=None) -> collections.OrderedDict:
    input_ids, input_mask, segment_ids = pack_sr(max_seq_length, sr)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    if data_id is not None:
        features["data_id"] = create_int_feature([data_id])
    features["label_ids"] = create_int_feature([label_id])
    return features
Example #13
def work(job_id):
    outfile = os.path.join(working_dir, "BLC_data", "{}".format(job_id))
    if os.path.exists(outfile):
        return "Skip"
    tf_logging.debug("Loading data")
    data = load(job_id)
    tf_logging.debug("Done")
    if data is None:
        return "No Input"

    writer = RecordWriterWrap(outfile)

    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())

    vectors = flatten_batches(data)
    basic_keys = "input_ids", "input_mask", "segment_ids"
    any_key = keys[0]
    data_len = len(vectors[any_key])
    num_predictions = len(vectors["grouped_positions"][0][0])

    for i in range(data_len):
        mask_valid = [0] * seq_length
        loss1_arr = [0] * seq_length
        loss2_arr = [0] * seq_length
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]

                loss1_arr[loc] = loss1
                loss2_arr[loc] = loss2
                assert mask_valid[loc] == 0
                mask_valid[loc] = 1

        features = collections.OrderedDict()
        for key in basic_keys:
            features[key] = create_int_feature(vectors[key][i])

        features["loss_valid"] = create_int_feature(mask_valid)
        features["loss1"] = create_float_feature(loss1_arr)
        features["loss2"] = create_float_feature(loss2_arr)
        features["next_sentence_labels"] = create_int_feature([0])
        writer.write_feature(features)
        #if i < 20:
        #    log_print_feature(features)
    writer.close()
    return "Done"
Example #14
    def get_feature(tokens1, tokens2, info):
        data_id = data_id_gen.new_id()
        info_list[data_id] = info
        tokens = tokens1 + tokens2
        segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length,
                                           tokens, segment_ids)
        features['label_ids'] = create_int_feature([0])
        features['data_id'] = create_int_feature([data_id])
        return features
Example #15
        def encode_fn(inst: Instance):
            tokens1 = inst.tokens1
            max_seg2_len = self.max_seq_length - 3 - len(tokens1)

            tokens2 = inst.tokens2[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:self.max_seq_length]
            segment_ids = segment_ids[:self.max_seq_length]
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([inst.label])
            features['data_id'] = create_int_feature([inst.data_id])
            return features
Example #16
def encode(tokenizer, max_seq_length, inst: QCInstance) -> OrderedDict:
    tokens1: List[str] = tokenizer.tokenize(inst.query_text)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenizer.tokenize(inst.candidate_text)
    tokens2 = tokens2[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                       segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
Example #17
    def write_instance_to_example_files(self, instances, output_files):
        """Create TF example files from `TrainingInstance`s."""
        writers = []
        for output_file in output_files:
            writers.append(tf.python_io.TFRecordWriter(output_file))

        writer_index = 0
        total_written = 0
        for (inst_index, instance) in enumerate(instances):
            input_ids = self.tokenizer.convert_tokens_to_ids(instance.tokens)
            input_mask = [1] * len(input_ids)
            segment_ids = list(instance.segment_ids)

            max_seq_length = self.max_seq_length
            assert len(input_ids) <= self.max_seq_length
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            next_sentence_label = 1 if instance.is_random_next else 0

            features = collections.OrderedDict()
            features["input_ids"] = btd.create_int_feature(input_ids)
            features["input_mask"] = btd.create_int_feature(input_mask)
            features["segment_ids"] = btd.create_int_feature(segment_ids)
            features["next_sentence_labels"] = btd.create_int_feature(
                [next_sentence_label])

            tf_example = tf.train.Example(features=tf.train.Features(
                feature=features))

            writers[writer_index].write(tf_example.SerializeToString())
            writer_index = (writer_index + 1) % len(writers)

            total_written += 1

            if inst_index < 20:
                log_print_inst(instance, features)

        for writer in writers:
            writer.close()

        tf_logging.info("Wrote %d total instances", total_written)
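The files written above can be read back with the standard TF1 TFRecord parsing pattern. A minimal sketch follows; the sequence length and file name are placeholders, not values taken from the source:

import tensorflow as tf

def parse_fn(serialized, max_seq_length=512):
    # Feature spec mirroring what write_instance_to_example_files serializes.
    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }
    return tf.parse_single_example(serialized, name_to_features)

dataset = tf.data.TFRecordDataset(["/path/to/output_file"]).map(parse_fn)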
Example #18
def main(dir_path):
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    all_insts = [neg_insts, pos_insts]

    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))

                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    random.shuffle(pos_insts)
    random.shuffle(neg_insts)

    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))

    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    writer.close()
Example #19
def augment_topic_ids(records, topic_id, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
Example #20
def transform_datapoint(data_point):
    input_ids = data_point['input_ids']
    max_seq_length = len(input_ids)
    assert max_seq_length == 200
    p, h = split_p_h_with_input_ids(input_ids, input_ids)
    segment_ids = (2+len(p)) * [0] + (1+len(h)) * [1]
    input_mask = (3+len(p)+len(h)) * [1]

    while len(segment_ids) < max_seq_length:
        input_mask.append(0)
        segment_ids.append(0)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([data_point['label']])
    return features
Example #21
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])

        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])

        return new_features_1, new_features_2
Example #22
def encode(tokenizer, get_tokens, max_seq_length, inst: Instance) -> OrderedDict:
    tokens1 = get_tokens(inst.pid1)
    tokens2 = get_tokens(inst.pid2)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    return features
Example #23
def main(dir_path):
    output_path = os.path.join(dir_path, "all")
    writer = RecordWriterWrap(output_path)
    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))
                writer.write_feature(new_features)
    writer.close()
Example #24
    def feature_transformer(feature):
        new_features = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        for left_right_idx in [1, 2]:
            input_names = [input_names1, input_names2][left_right_idx - 1]
            input_ids = take(feature["input_ids{}".format(left_right_idx)])
            input_masks = take(feature["input_mask{}".format(left_right_idx)])
            cls_loc = []
            last_non_pad = -1
            for i in range(seq_length):
                if input_ids[i] == 101:  # 101 is the [CLS] id in the BERT vocab
                    cls_loc.append(i)

                if input_masks[i]:
                    last_non_pad = i

            assert last_non_pad >= 0
            assert last_non_pad > cls_loc[-1]
            assert len(cls_loc) <= max_num_seg

            num_seg = len(cls_loc)
            input_building = {}
            for name in input_names:
                input_building[name] = []

            for i in range(num_seg):
                st = cls_loc[i]
                ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
                pad_len = window_size - (ed - st)

                for input_name in input_names:
                    arr = take(feature[input_name])
                    seq = arr[st:ed] + pad_len * [0]
                    input_building[input_name].extend(seq)

            n_empty_seg = max_num_seg - num_seg
            for i in range(n_empty_seg):
                for input_name in input_names:
                    input_building[input_name].extend([0] * window_size)

            for input_name in input_names:
                checksum1 = sum(input_building[input_name])
                checksum2 = sum(take(feature[input_name]))
                assert checksum1 == checksum2

            for input_name in input_names:
                new_features[input_name] = create_int_feature(
                    input_building[input_name])

        new_features["data_ids"] = put("data_ids")
        return new_features
Example #25
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        mapping = {0: 0, 1: 1, 2: 1}

        for key in feature:
            v = take(feature[key])
            if key == "label_ids":
                v = [mapping[v[0]]]
            new_features[key] = create_int_feature(v)

        return new_features
Example #26
    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        mapping = {0: 0, 1: 0, 2: 1}

        for key in feature:
            l = take(feature[key])
            if key == "segment_ids":
                l = list([mapping[v] for v in l])

            new_features[key] = create_int_feature(l)

        return new_features
Example #27
    def instance_to_features(self, instance):
        basic_features = self.get_basic_input_features(
            instance.tokens, instance.segment_ids)
        lm_mask_features = self.get_masked_lm_features(
            instance.masked_lm_positions, instance.masked_lm_labels)
        features = OrderedDictBuilder()
        features.extend(basic_features)
        features.extend(lm_mask_features)
        next_sentence_label = 1 if instance.is_random_next else 0
        features["next_sentence_labels"] = btd.create_int_feature(
            [next_sentence_label])
        return features
Example #28
    def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
        para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens

        tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] \
                 + para_tokens + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 1) \
                      + [1] * (len(tokens2) + 1) \
                      + [2] * (len(para_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([label])
        return features
Example #29
def augment_topic_ids(records, save_path):
    writer = RecordWriterWrap(save_path)

    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        input_ids = first_inst["input_ids"].int64_list.value
        token_ids = input_ids[1]
        topic = token_ids_to_topic[token_ids]
        topic_id = data_generator.argmining.ukp_header.all_topics.index(topic)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)

    writer.close()
Example #30
    def _write_instances(self, insts, output_file):
        writer = RecordWriterWrap(output_file)

        for instance in insts:
            word_tokens, def_tokens, segment_ids = instance
            word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
            while len(word_tokens_ids) < self.max_word_tokens:
                word_tokens_ids.append(0)
            features["word"] = create_int_feature(word_tokens_ids)
            writer.write_feature(features)
        writer.close()
        tf_logging.info("Wrote %d total instances", writer.total_written)