def get_dict_input_features(tokenizer, max_def_length, max_d_loc, max_word_len,
                            segment_ids, dict_def, word_loc_list, dict_word):
    d_input_ids = tokenizer.convert_tokens_to_ids(dict_def)
    d_input_ids = d_input_ids[:max_def_length]
    d_input_mask = [1] * len(d_input_ids)

    if word_loc_list:
        target_segment = segment_ids[word_loc_list[0]]
        d_segment_ids = [target_segment] * len(d_input_ids)
    else:
        d_segment_ids = []

    if dict_word is not None:
        selected_word = tokenizer.convert_tokens_to_ids(dict_word.subword_rep)
    else:
        selected_word = []

    d_input_ids = pad0(d_input_ids, max_def_length)
    d_input_mask = pad0(d_input_mask, max_def_length)
    d_location_ids = pad0(word_loc_list[:max_d_loc], max_d_loc)
    d_segment_ids = pad0(d_segment_ids, max_def_length)
    selected_word = pad0(selected_word, max_word_len)

    features = collections.OrderedDict()
    features["d_input_ids"] = btd.create_int_feature(d_input_ids)
    features["d_input_mask"] = btd.create_int_feature(d_input_mask)
    features["d_segment_ids"] = btd.create_int_feature(d_segment_ids)
    features["d_location_ids"] = btd.create_int_feature(d_location_ids)
    features["selected_word"] = btd.create_int_feature(selected_word)
    return features

def feature_transformer(feature):
    new_features = collections.OrderedDict()
    success = False
    for key in feature:
        v = take(feature[key])
        if key == "input_ids":
            alt_emb_mask = [0] * len(v)
            s = set(v)
            # all_tokens, min_overlap and seq_set are expected from the enclosing scope;
            # only inputs with enough token overlap are scanned for full sequences.
            if len(s.intersection(all_tokens)) >= min_overlap:
                for word in seq_set:
                    pre_match = 0
                    for i in range(len(v)):
                        if v[i] == word[pre_match]:
                            pre_match += 1
                        else:
                            # Restart the match; the current token may itself begin a new one.
                            pre_match = 1 if v[i] == word[0] else 0
                        if pre_match == len(word):
                            pre_match = 0
                            # Mark every position covered by the matched sequence.
                            for j in range(i - len(word) + 1, i + 1):
                                alt_emb_mask[j] = 1
                            success = True
            new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
        new_features[key] = create_int_feature(v)
    if success:
        return new_features
    return None

def ordered_dict_from_input_segment_mask_ids(input_ids, input_mask, segment_ids):
    features = collections.OrderedDict()
    features["input_ids"] = btd.create_int_feature(input_ids)
    features["input_mask"] = btd.create_int_feature(input_mask)
    features["segment_ids"] = btd.create_int_feature(segment_ids)
    return features

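# Helper sketch (not part of the original module): the create_int_feature /
# create_float_feature and btd.create_int_feature calls used throughout are
# assumed to follow the standard BERT data-prep wrappers around tf.train.Feature.
# Minimal versions, for reference only:
import tensorflow as tf

def create_int_feature_sketch(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def create_float_feature_sketch(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
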
def augment(short_records, long_records, target_len, save_dir, start_record_idx=0):
    exist_or_mkdir(save_dir)
    record_idx = start_record_idx
    print("record_idx", record_idx)

    def get_next_writer():
        return RecordWriterWrap(os.path.join(save_dir, str(record_idx)))

    writer = get_next_writer()
    cnt = 0
    while cnt < target_len:
        first_inst = next(short_records)
        second_inst = next(long_records)
        first_inst = feature_to_ordered_dict(first_inst)
        first_inst["next_sentence_labels"] = create_int_feature([1])
        second_inst = feature_to_ordered_dict(second_inst)
        second_inst["next_sentence_labels"] = create_int_feature([1])
        writer.write_feature(first_inst)
        writer.write_feature(second_inst)
        cnt += 2
        # Start a new shard once the current one holds 100000 features.
        if writer.total_written >= 100000:
            record_idx += 1
            print("Wrote {} data".format(cnt))
            writer.close()
            writer = get_next_writer()
    writer.close()

def rel_filter(tfrecord_itr,
               relevance_scores: Dict[DataID, Tuple[CPIDPair, Logits, Logits]],
               cpid_to_label: Dict[CPIDPair, int]) -> Iterator[OrderedDict]:
    # Records arrive in pairs: buffer the first segment, then decide on the pair
    # when the second segment (which carries the data_id) arrives.
    last_feature = None
    for features in tfrecord_itr:
        if last_feature is None:
            last_feature = features
            continue
        data_id = take(features["data_id"])[0]
        t = relevance_scores[data_id]
        cpid: CPIDPair = t[0]
        c_logits = t[1]
        p_logits = t[2]
        c_score = softmax(c_logits)[1]
        p_score = softmax(p_logits)[1]
        weight = c_score * p_score
        label: int = cpid_to_label[cpid]
        if weight > 0.5:
            new_feature = combine_segment(last_feature, features)
            # new_feature['weight'] = create_float_feature([weight])
            new_feature['label_ids'] = create_int_feature([label])
            new_feature['data_id'] = create_int_feature([data_id])
            yield new_feature
        last_feature = None

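# Sketch of the softmax helper used by rel_filter above; its real import is not
# shown here, so this numerically stable NumPy version is only an assumption.
import numpy as np

def softmax_sketch(logits):
    x = np.asarray(logits, dtype=np.float64)
    x = x - x.max()       # shift for numerical stability
    e = np.exp(x)
    return e / e.sum()    # index [1] above reads the positive-class probability
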
def encode_w_data_id(tokenizer, max_seq_length, t: Instance):
    tokens = ["[CLS]"] + t.tokens + ["[SEP]"]
    segment_ids = [0] * (len(t.tokens) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(t.label)])
    features['data_id'] = create_int_feature([int(t.data_id)])
    return features

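# Assumed behavior of get_basic_input_feature, inferred from
# write_instance_to_example_files further below: convert tokens to ids, build the
# attention mask, zero-pad to max_seq_length, and wrap everything as int64 features.
# Sketch only; the real helper may differ in details.
def get_basic_input_feature_sketch(tokenizer, max_seq_length, tokens, segment_ids):
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features
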
def combine_feature(lm_entry, nli_entry):
    new_features = collections.OrderedDict()
    for key in lm_entry:
        new_features[key] = create_int_feature(take(lm_entry[key]))
    for key in nli_entry:
        if key == "label_ids":
            new_features[key] = create_int_feature(take(nli_entry[key]))
        else:
            new_key = "nli_" + key
            new_features[new_key] = create_int_feature(take(nli_entry[key]))
    return new_features

def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))
    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)
    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir, "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except Exception:
            try:
                # Fall back to locating [SEP] while accounting for masked positions.
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id,
                    output1.masked_lm_ids[i], output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i], sep_indice)
            except Exception:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()
    record_writer.close()

def get_segment_and_mask_inner(input_ids, sep_indice):
    a_len = sep_indice[0] + 1
    b_len = sep_indice[1] + 1 - a_len
    pad_len = len(input_ids) - (a_len + b_len)
    segment_ids = [0] * a_len + [1] * b_len + [0] * pad_len
    input_mask = [1] * (a_len + b_len) + [0] * pad_len
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    return features

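# Sketch of the outer get_segment_and_mask used in do() above, assuming it only
# locates the two [SEP] tokens and delegates to the _inner variant; a missing
# [SEP] would then raise and trigger the masking-aware fallback in do().
def get_segment_and_mask_sketch(input_ids, sep_id):
    sep_indice = [i for i, t in enumerate(input_ids) if t == sep_id]
    return get_segment_and_mask_inner(input_ids, [sep_indice[0], sep_indice[1]])
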
def get_masked_lm_features(tokenizer, max_predictions_per_seq,
                           masked_lm_positions, masked_lm_labels):
    masked_lm_positions, masked_lm_ids, masked_lm_weights = \
        get_masked_lm_features_as_list(tokenizer, max_predictions_per_seq,
                                       masked_lm_positions, masked_lm_labels)
    features = collections.OrderedDict()
    features["masked_lm_positions"] = btd.create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = btd.create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = btd.create_float_feature(masked_lm_weights)
    return features

def enc_to_feature2(tokenizer, max_seq_length, inst: QCInstanceTokenized) -> OrderedDict:
    seg1 = inst.query_text
    seg2 = inst.candidate_text
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["data_id"] = create_int_feature([int(inst.data_id)])
    feature["label_ids"] = create_int_feature([int(inst.is_correct)])
    return feature

def encode_sr(sr: SegmentRepresentation, max_seq_length, label_id, data_id=None) -> collections.OrderedDict:
    input_ids, input_mask, segment_ids = pack_sr(max_seq_length, sr)
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    if data_id is not None:
        features["data_id"] = create_int_feature([data_id])
    features["label_ids"] = create_int_feature([label_id])
    return features

def work(job_id):
    outfile = os.path.join(working_dir, "BLC_data", "{}".format(job_id))
    if os.path.exists(outfile):
        return "Skip"
    tf_logging.debug("Loading data")
    data = load(job_id)
    tf_logging.debug("Done")
    if data is None:
        return "No Input"

    writer = RecordWriterWrap(outfile)
    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())
    vectors = flatten_batches(data)
    basic_keys = "input_ids", "input_mask", "segment_ids"
    any_key = keys[0]
    data_len = len(vectors[any_key])
    num_predictions = len(vectors["grouped_positions"][0][0])
    for i in range(data_len):
        mask_valid = [0] * seq_length
        loss1_arr = [0] * seq_length
        loss2_arr = [0] * seq_length
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]
                loss1_arr[loc] = loss1
                loss2_arr[loc] = loss2
                assert mask_valid[loc] == 0
                mask_valid[loc] = 1

        features = collections.OrderedDict()
        for key in basic_keys:
            features[key] = create_int_feature(vectors[key][i])
        features["loss_valid"] = create_int_feature(mask_valid)
        features["loss1"] = create_float_feature(loss1_arr)
        features["loss2"] = create_float_feature(loss2_arr)
        features["next_sentence_labels"] = create_int_feature([0])
        writer.write_feature(features)
        # if i < 20:
        #     log_print_feature(features)

    writer.close()
    return "Done"

def get_feature(tokens1, tokens2, info):
    data_id = data_id_gen.new_id()
    info_list[data_id] = info
    tokens = tokens1 + tokens2
    segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([0])
    features['data_id'] = create_int_feature([data_id])
    return features

def encode_fn(inst: Instance):
    tokens1 = inst.tokens1
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2 = inst.tokens2[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:self.max_seq_length]
    segment_ids = segment_ids[:self.max_seq_length]
    features = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def encode(tokenizer, max_seq_length, inst: QCInstance) -> OrderedDict:
    tokens1: List[str] = tokenizer.tokenize(inst.query_text)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenizer.tokenize(inst.candidate_text)
    tokens2 = tokens2[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features

def write_instance_to_example_files(self, instances, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = self.tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        max_seq_length = self.max_seq_length
        assert len(input_ids) <= self.max_seq_length

        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = btd.create_int_feature(input_ids)
        features["input_mask"] = btd.create_int_feature(input_mask)
        features["segment_ids"] = btd.create_int_feature(segment_ids)
        features["next_sentence_labels"] = btd.create_int_feature([next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1

        if inst_index < 20:
            log_print_inst(instance, features)

    for writer in writers:
        writer.close()

    tf_logging.info("Wrote %d total instances", total_written)

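# Hypothetical reader for the records written above; the feature names and shapes
# mirror what write_instance_to_example_files emits, and max_seq_length is an
# assumed placeholder. TF1-style parsing APIs to match the writer above.
def parse_example_sketch(serialized, max_seq_length):
    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }
    return tf.parse_single_example(serialized, name_to_features)
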
def main(dir_path): output_path = os.path.join(dir_path, "all_balanced") pos_insts = [] neg_insts = [] all_insts = [neg_insts, pos_insts] for i in range(665): p = os.path.join(dir_path, str(i)) if os.path.exists(p): for record in load_record(p): new_features = collections.OrderedDict() for key in record: new_features[key] = create_int_feature(take(record[key])) label = take(record['label_ids'])[0] all_insts[label].append(new_features) random.shuffle(pos_insts) random.shuffle(neg_insts) num_sel = min(len(pos_insts), len(neg_insts)) print("{} insts per label".format(num_sel)) insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel] writer = RecordWriterWrap(output_path) foreach(writer.write_feature, insts_to_write)
def augment_topic_ids(records, topic_id, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)
    writer.close()

def transform_datapoint(data_point):
    input_ids = data_point['input_ids']
    max_seq_length = len(input_ids)
    assert max_seq_length == 200
    p, h = split_p_h_with_input_ids(input_ids, input_ids)
    segment_ids = (2 + len(p)) * [0] + (1 + len(h)) * [1]
    input_mask = (3 + len(p) + len(h)) * [1]
    while len(segment_ids) < max_seq_length:
        input_mask.append(0)
        segment_ids.append(0)

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([data_point['label']])
    return features

def feature_transformer(feature):
    new_features_1 = collections.OrderedDict()
    new_features_2 = collections.OrderedDict()

    def put(feature_name):
        return create_int_feature(take(feature[feature_name]))

    new_features_1["input_ids"] = put("input_ids1")
    new_features_1["input_mask"] = put("input_mask1")
    new_features_1["segment_ids"] = put("segment_ids1")
    new_features_1["label_ids"] = create_int_feature([1])

    new_features_2["input_ids"] = put("input_ids2")
    new_features_2["input_mask"] = put("input_mask2")
    new_features_2["segment_ids"] = put("segment_ids2")
    new_features_2["label_ids"] = create_int_feature([0])
    return new_features_1, new_features_2

def encode(tokenizer, get_tokens, max_seq_length, inst: Instance) -> OrderedDict:
    tokens1 = get_tokens(inst.pid1)
    tokens2 = get_tokens(inst.pid2)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    return features

def main(dir_path): output_path = os.path.join(dir_path, "all") writer = RecordWriterWrap(output_path) for i in range(665): p = os.path.join(dir_path, str(i)) if os.path.exists(p): for record in load_record(p): new_features = collections.OrderedDict() for key in record: new_features[key] = create_int_feature(take(record[key])) writer.write_feature(new_features)
def feature_transformer(feature):
    new_features = collections.OrderedDict()

    def put(feature_name):
        return create_int_feature(take(feature[feature_name]))

    for left_right_idx in [1, 2]:
        input_names = [input_names1, input_names2][left_right_idx - 1]
        input_ids = take(feature["input_ids{}".format(left_right_idx)])
        input_masks = take(feature["input_mask{}".format(left_right_idx)])
        # Locate each segment start ([CLS] = token id 101) and the last non-padded position.
        cls_loc = []
        last_non_pad = -1
        for i in range(seq_length):
            if input_ids[i] == 101:
                cls_loc.append(i)
            if input_masks[i]:
                last_non_pad = i
        assert last_non_pad >= 0
        assert last_non_pad > cls_loc[-1]
        assert len(cls_loc) <= max_num_seg
        num_seg = len(cls_loc)

        input_building = {}
        for name in input_names:
            input_building[name] = []

        # Re-pack each segment into a fixed-size window, zero-padded to window_size.
        for i in range(num_seg):
            st = cls_loc[i]
            ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
            pad_len = window_size - (ed - st)
            for input_name in input_names:
                arr = take(feature[input_name])
                seq = arr[st:ed] + pad_len * [0]
                input_building[input_name].extend(seq)

        n_empty_seg = max_num_seg - num_seg
        for i in range(n_empty_seg):
            for input_name in input_names:
                input_building[input_name].extend([0] * window_size)

        # Zero-padding preserves the sum, so the checksums must match.
        for input_name in input_names:
            checksum1 = sum(input_building[input_name])
            checksum2 = sum(take(feature[input_name]))
            assert checksum1 == checksum2

        for input_name in input_names:
            new_features[input_name] = create_int_feature(input_building[input_name])

    new_features["data_ids"] = put("data_ids")
    return new_features

def feature_transformer(feature):
    new_features = collections.OrderedDict()
    # Collapse three-way labels into binary: 0 -> 0, {1, 2} -> 1.
    mapping = {0: 0, 1: 1, 2: 1}
    for key in feature:
        v = take(feature[key])
        if key == "label_ids":
            v = [mapping[v[0]]]
        new_features[key] = create_int_feature(v)
    return new_features

def feature_transformer(feature):
    new_features = collections.OrderedDict()
    # Remap segment ids: segments 0 and 1 -> 0, segment 2 -> 1.
    mapping = {0: 0, 1: 0, 2: 1}
    for key in feature:
        l = take(feature[key])
        if key == "segment_ids":
            l = list([mapping[v] for v in l])
        new_features[key] = create_int_feature(l)
    return new_features

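# Hypothetical driver showing how the feature_transformer variants above would
# typically be applied: stream records through the transformer and write the
# result. load_record and RecordWriterWrap come from this codebase; the paths
# and the function name are placeholders, not part of the original module.
def transform_file_sketch(in_path, out_path, transformer):
    writer = RecordWriterWrap(out_path)
    for record in load_record(in_path):
        new_features = transformer(record)
        if new_features is not None:  # some transformers return None to drop a record
            writer.write_feature(new_features)
    writer.close()
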
def instance_to_features(self, instance):
    basic_features = self.get_basic_input_features(instance.tokens, instance.segment_ids)
    lm_mask_features = self.get_masked_lm_features(instance.masked_lm_positions,
                                                   instance.masked_lm_labels)
    features = OrderedDictBuilder()
    features.extend(basic_features)
    features.extend(lm_mask_features)
    next_sentence_label = 1 if instance.is_random_next else 0
    features["next_sentence_labels"] = btd.create_int_feature([next_sentence_label])
    return features

def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
    para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens
    tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] + para_tokens + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 1) \
                  + [1] * (len(tokens2) + 1) \
                  + [2] * (len(para_tokens) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([label])
    return features

def augment_topic_ids(records, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        input_ids = first_inst["input_ids"].int64_list.value
        # The token right after [CLS] identifies the topic.
        topic_token_id = input_ids[1]
        topic = token_ids_to_topic[topic_token_id]
        topic_id = data_generator.argmining.ukp_header.all_topics.index(topic)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)
    writer.close()

def _write_instances(self, insts, output_file):
    writer = RecordWriterWrap(output_file)
    for instance in insts:
        word_tokens, def_tokens, segment_ids = instance
        word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length,
                                           def_tokens, segment_ids)
        while len(word_tokens_ids) < self.max_word_tokens:
            word_tokens_ids.append(0)
        features["word"] = create_int_feature(word_tokens_ids)
        writer.write_feature(features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)