def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]), ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path, "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)
    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)
        if name == "dev":
            # Pad the dev split with copies of the last feature, marked as
            # not-real, so the record count is a multiple of batch_size.
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)
        writer.close()
        print("Wrote %d total instances" % writer.total_written)
def convert_to_unpaired(source_path, output_path):
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])
        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])
        return new_features_1, new_features_2

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1, new_features_2 = feature_transformer(feature)
        writer.write_feature(new_features_1)
        writer.write_feature(new_features_2)
    writer.close()
def write_records(records: List[Record], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + ["[SEP]"] + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        output = []
        for t in tokens:
            ts = tokenizer.tokenize(t)
            output.extend(ts)
        return output

    def encode(inst: Payload) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        max_seg2_len = max_seq_length - 3 - len(tokens1)
        tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) \
                      + [1] * (len(tokens2) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0
    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']
        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)

            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)
    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
def work(self, job_id):
    qid_to_max_seg_idx: Dict[str, Dict[str, int]] = self.best_seg_collector.get_best_seg_info_2d(job_id)
    qids = self.query_group[job_id]
    output_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(output_path)
    for qid in qids:
        sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
        doc_ids = list(qid_to_max_seg_idx[qid].keys())
        max_seg_idx_d = qid_to_max_seg_idx[qid]
        pos_doc, neg_doc = self.pool_pos_neg_doc(doc_ids, sr_per_qid)

        def get_max_seg(sr_per_doc: SRPerQueryDoc) -> SegmentRepresentation:
            max_seg_idx = max_seg_idx_d[sr_per_doc.doc_id]
            try:
                seg = sr_per_doc.segs[max_seg_idx]
            except IndexError:
                print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                raise
            return seg

        pos_seg = get_max_seg(pos_doc)
        neg_seg = get_max_seg(neg_doc)
        feature = encode_sr_pair(pos_seg, neg_seg, self.max_seq_length)
        writer.write_feature(feature)
    writer.close()
def work(self, job_id):
    qid_to_max_seg_idx: Dict[Tuple[str, str], int] = self.best_seg_collector.get_best_seg_info(job_id)
    qids = self.query_group[job_id]
    output_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(output_path)
    for qid in qids:
        sr_per_qid = self.seg_resource_loader.load_for_qid(qid)
        for sr_per_doc in sr_per_qid.sr_per_query_doc:
            if len(sr_per_doc.segs) == 1:
                continue
            qdid = qid, sr_per_doc.doc_id
            max_seg_idx = qid_to_max_seg_idx[qdid]
            label_id = sr_per_doc.label
            try:
                seg = sr_per_doc.segs[max_seg_idx]
                feature = encode_sr(seg, self.max_seq_length, label_id)
                writer.write_feature(feature)
            except IndexError:
                print('qid={} doc_id={}'.format(qid, sr_per_doc.doc_id))
                print("max_seg_idx={} but len(segs)={}".format(max_seg_idx, len(sr_per_doc.segs)))
                raise
    writer.close()
def augment_topic_ids(records, topic_id, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)
    writer.close()
def tfrecord_convertor(source_path: FilePath,
                       output_path: FilePath,
                       feature_transformer):
    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features = feature_transformer(feature)
        writer.write_feature(new_features)
    writer.close()
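# A minimal usage sketch for tfrecord_convertor, not part of the original
# module: the paths and the relabel_to_zero transformer are hypothetical,
# and it assumes feature_to_ordered_dict / create_int_feature from this
# codebase are in scope.
def relabel_to_zero(feature):
    # Copy the parsed record into an OrderedDict of features and force
    # the label to 0, leaving every other feature untouched.
    new_features = feature_to_ordered_dict(feature)
    new_features["label_ids"] = create_int_feature([0])
    return new_features

tfrecord_convertor("/path/to/source.tfrecord",
                   "/path/to/output.tfrecord",
                   relabel_to_zero)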
def encode2(itr_lm, itr_nli, out_path):
    writer = RecordWriterWrap(out_path)
    for nli_entry in itr_nli:
        lm_entry = next(itr_lm)
        new_features = combine_feature(lm_entry, nli_entry)
        writer.write_feature(new_features)
    print("Wrote {} items".format(writer.total_written))
    writer.close()
def do_filtering(file_path, out_path, condition_fn, debug_call_back=None):
    writer = RecordWriterWrap(out_path)
    for item in load_record(file_path):
        features = feature_to_ordered_dict(item)
        if condition_fn(features):
            if debug_call_back is not None:
                debug_call_back(features)
            writer.write_feature(features)
    writer.close()
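# A minimal usage sketch for do_filtering, not part of the original module:
# the paths are hypothetical, and it assumes "label_ids" is an int64 feature
# as in the other generators here (feature values are read via
# .int64_list.value, as in augment_topic_ids below).
def keep_positive(features):
    # Keep only records whose label is 1.
    return features["label_ids"].int64_list.value[0] == 1

do_filtering("/path/to/input.tfrecord",
             "/path/to/positives.tfrecord",
             keep_positive)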
def baseline_bert_gen_unbal_resplit(outpath, split):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point_50_train_val(split)
    max_seq_length = 512
    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(tokenizer, max_seq_length, entry))
    writer.close()
def write_pairwise_record(tokenizer, max_seq_length, insts, out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        (tokens, segment_ids), (tokens2, segment_ids2) = inst
        features = combine_features(tokens, segment_ids, tokens2, segment_ids2, tokenizer, max_seq_length)
        writer.write_feature(features)
    writer.close()
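# A minimal sketch of building one instance for write_pairwise_record, not
# part of the original module: the query/doc_pos/doc_neg naming is an
# assumption, and the [CLS]/[SEP] segment layout follows the encoders used
# elsewhere in this codebase.
def make_pair_inst(tokenizer, query, doc_pos, doc_neg):
    def encode_one(text_a, text_b):
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = tokenizer.tokenize(text_b)
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        return tokens, segment_ids

    # Returns ((tokens, segment_ids), (tokens2, segment_ids2)), the shape
    # write_pairwise_record unpacks for each inst.
    return encode_one(query, doc_pos), encode_one(query, doc_neg)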
def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(save_path)
    for f in rel_filter(features, self.relevance_scores, self.cpid_to_label):
        writer.write_feature(f)
    writer.close()
def work(self, job_id):
    tfrecord_path = os.path.join(self.input_dir, str(job_id))
    features = load_record(tfrecord_path)
    save_path = os.path.join(self.out_dir, str(job_id))
    writer = RecordWriterWrap(save_path)
    for f in collect_passages(features, self.relevance_scores, self.cpid_to_label,
                              self.num_max_para, self.window_size):
        writer.write_feature(f)
    writer.close()
def work(self, job_id):
    features: List[ParagraphClaimPersFeature] = pickle.load(
        open(os.path.join(self.input_dir, str(job_id)), "rb"))
    writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
    for f in features:
        f2: ParagraphFeature = to_paragraph_feature(f)
        encoded_list: List[OrderedDict] = format_paragraph_features(
            self.tokenizer, self.max_seq_length, f2)
        foreach(writer.write_feature, encoded_list)
    writer.close()
def write(self, insts: List[Instance], out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        feature = get_basic_input_feature(self.tokenizer, self.max_seq_length, inst.tokens, inst.seg_ids)
        feature["data_id"] = create_int_feature([int(inst.data_id)])
        feature["label_ids"] = create_int_feature([int(inst.label)])
        writer.write_feature(feature)
    writer.close()
def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))
    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)
    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir, "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except Exception:
            # Fall back to locating [SEP] while accounting for masked positions.
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id,
                    output1.masked_lm_ids[i], output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i], sep_indice)
            except Exception:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()
    record_writer.close()
def augment_topic_ids(records, save_path):
    writer = RecordWriterWrap(save_path)
    for feature in records:
        first_inst = feature_to_ordered_dict(feature)
        input_ids = first_inst["input_ids"].int64_list.value
        # input_ids[1] is the first token after [CLS]; it is used as the key
        # for looking up the topic.
        token_ids = input_ids[1]
        topic = token_ids_to_topic[token_ids]
        topic_id = data_generator.argmining.ukp_header.all_topics.index(topic)
        first_inst["topic_ids"] = create_int_feature([topic_id])
        writer.write_feature(first_inst)
    writer.close()
def _write_instances(self, insts, output_file):
    writer = RecordWriterWrap(output_file)
    for instance in insts:
        word_tokens, def_tokens, segment_ids = instance
        word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
        while len(word_tokens_ids) < self.max_word_tokens:
            word_tokens_ids.append(0)
        features["word"] = create_int_feature(word_tokens_ids)
        writer.write_feature(features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
def work(job_id):
    outfile = os.path.join(working_dir, "BLC_data", "{}".format(job_id))
    if os.path.exists(outfile):
        return "Skip"
    tf_logging.debug("Loading data")
    data = load(job_id)
    tf_logging.debug("Done")
    if data is None:
        return "No Input"
    writer = RecordWriterWrap(outfile)
    batch_size, seq_length = data[0]['input_ids'].shape
    keys = list(data[0].keys())
    vectors = flatten_batches(data)
    basic_keys = "input_ids", "input_mask", "segment_ids"
    any_key = keys[0]
    data_len = len(vectors[any_key])
    num_predictions = len(vectors["grouped_positions"][0][0])
    for i in range(data_len):
        mask_valid = [0] * seq_length
        loss1_arr = [0] * seq_length
        loss2_arr = [0] * seq_length
        positions = vectors["grouped_positions"][i]
        num_trials = len(positions)
        for t_i in range(num_trials):
            for p_i in range(num_predictions):
                loc = vectors["grouped_positions"][i][t_i][p_i]
                loss1 = vectors["grouped_loss1"][i][t_i][p_i]
                loss2 = vectors["grouped_loss2"][i][t_i][p_i]
                loss1_arr[loc] = loss1
                loss2_arr[loc] = loss2
                assert mask_valid[loc] == 0
                mask_valid[loc] = 1

        features = collections.OrderedDict()
        for key in basic_keys:
            features[key] = create_int_feature(vectors[key][i])
        features["loss_valid"] = create_int_feature(mask_valid)
        features["loss1"] = create_float_feature(loss1_arr)
        features["loss2"] = create_float_feature(loss2_arr)
        features["next_sentence_labels"] = create_int_feature([0])
        writer.write_feature(features)
        # if i < 20:
        #     log_print_feature(features)
    writer.close()
    return "Done"
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    for (inst_index, instance) in enumerate(new_inst_list):
        tokens, segment_ids = instance
        features = get_basic_input_feature(self.tokenizer, self.target_seq_length, tokens, segment_ids)
        features["use_context"] = create_int_feature([1])
        writer.write_feature(features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
    return example_numbers
def write(self, insts, out_path):
    writer = RecordWriterWrap(out_path)
    doc_id_list = []
    for inst in insts:
        tokens, segment_ids, doc_id = inst
        feature = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
        doc_id_list.append(doc_id)
        writer.write_feature(feature)
    # Save the doc_id for each written feature alongside the record file.
    with open(out_path + ".info", "wb") as f:
        pickle.dump(doc_id_list, f)
    writer.close()
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    feature_formatter = MLMFeaturizer(self.tokenizer, self.max_seq_length, self.max_predictions_per_seq)
    for (inst_index, instance) in enumerate(new_inst_list):
        features = feature_formatter.instance_to_features(instance)
        writer.write_feature(features)
        if inst_index < 20:
            log_print_inst(instance, features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
    return example_numbers
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    for (inst_index, instance) in enumerate(new_inst_list):
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length,
                                           instance.tokens, instance.segment_ids)
        writer.write_feature(features)
        if inst_index < 20:
            log_print_inst(instance, features)
    writer.close()
    return example_numbers
def write(self, insts, out_path):
    writer = RecordWriterWrap(out_path)

    def tokens_to_int_feature(tokens):
        return create_int_feature(self.tokenizer.convert_tokens_to_ids(tokens))

    for inst in insts:
        query_tokens, content_tokens, label = inst
        feature = collections.OrderedDict()
        feature['query'] = tokens_to_int_feature(query_tokens[:self.max_query_len])
        feature['content'] = tokens_to_int_feature(content_tokens[:self.max_seq_length])
        feature['label_ids'] = create_int_feature([label])
        writer.write_feature(feature)
    writer.close()
def write_instances(self, instances, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    get_basic_input_features_fn = partial(get_basic_input_feature, self.tokenizer, self.max_seq_length)
    for (inst_index, instance) in enumerate(instances):
        features = get_basic_input_features_fn(instance.tokens, instance.segment_ids)
        features["use_context"] = create_int_feature([1])
        writer.write_feature(features)
        if inst_index < 20:
            log_print_inst(instance, features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
    return example_numbers
def gen_mismatched():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    dir_path = os.path.join(output_path, "nli_tfrecord_cls_{}".format(sequence_length))
    name = "dev_mis"
    output_file = os.path.join(dir_path, name)
    batch_size = 32
    writer = RecordWriterWrap(output_file)
    for e in data_loader.example_generator(data_loader.dev_file2):
        f = entry_to_feature_dict(e)
        f["is_real_example"] = create_int_feature([1])
        writer.write_feature(f)
    # Pad with copies of the last feature, marked as not-real, so the
    # record count is a multiple of batch_size.
    while writer.total_written % batch_size != 0:
        f["is_real_example"] = create_int_feature([0])
        writer.write_feature(f)
    writer.close()
    print("Wrote %d total instances" % writer.total_written)
def gen_with_aux_emb(outpath, aux_embedding_d, split, dims):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point(split)
    max_seq_length = 512
    zero_vector = [0.] * dims
    not_found = set()

    def get_aux_embedding_fn(cid):
        cid = int(cid)
        if cid in aux_embedding_d:
            return aux_embedding_d[cid]
        else:
            if cid not in not_found:
                not_found.add(cid)
                print("Aux embedding not found", cid)
            return {}

    def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
        emb_model = get_aux_embedding_fn(pc.cid)

        seg1 = tokenizer.tokenize(pc.claim_text)
        seg2 = tokenizer.tokenize(pc.p_text)

        input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
        aux_emb = get_word_embedding(emb_model, input_tokens, dims)
        aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
        aux_emb = np.array(aux_emb)
        flat_aux_emb = np.reshape(aux_emb, [-1])

        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
        feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
        feature["label_ids"] = create_int_feature([int(pc.label)])
        feature["aux_emb"] = create_float_feature(flat_aux_emb)
        return feature

    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(entry))
    writer.close()
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    for (inst_index, instance) in enumerate(new_inst_list):
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length,
                                           instance.tokens, instance.segment_ids)
        features["next_sentence_labels"] = btd.create_int_feature([0])
        writer.write_feature(features)
        if inst_index < 20:
            log_print_inst(instance, features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
    return example_numbers