def encode_classification_instance(tokenizer, max_seq_length, inst: ClassificationInstance) -> OrderedDict:
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, inst.tokens, inst.seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    return feature
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0
    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']
        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)
    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
def encode_fn(self, inst: TokenScoringInstance) -> OrderedDict:
    max_seq_length = self.max_seq_length
    tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2, scores = self.tokenize_from_tokens_w_scores(inst.doc_tokens, inst.score)
    tokens2 = tokens2[:max_seg2_len]
    scores: ScoreVector = scores[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    score_vector = pad_score_vector(scores, max_seq_length, len(tokens1))
    if len(score_vector) != max_seq_length:
        print(score_vector)
        print(len(score_vector))
        print(max_seq_length)
        print(len(scores))
        print(scores)
    assert len(score_vector) == max_seq_length
    features['label_ids'] = score_vector_to_feature(score_vector)
    features['data_id'] = create_int_feature([inst.data_id])
    return features
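# Hedged sketch (assumption, not the repo's actual helper): pad_score_vector above is
# assumed to align the per-token scores of segment 2 with the final input layout,
# i.e. zeros for "[CLS]" + query tokens + "[SEP]", then the truncated doc-token scores,
# then zero padding up to max_seq_length. A minimal reimplementation under that assumption:
from typing import List


def pad_score_vector_sketch(scores: List[float],
                            max_seq_length: int,
                            len_tokens1: int) -> List[float]:
    prefix_len = len_tokens1 + 2  # positions of "[CLS]" + query tokens + "[SEP]"
    vector = [0.0] * prefix_len + list(scores)
    vector = vector[:max_seq_length]
    vector += [0.0] * (max_seq_length - len(vector))  # pad tail to fixed length
    return vector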
def encode(tokenizer, max_seq_length, t: Tuple[str, bool]):
    text, is_correct = t
    tokens1: List[str] = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(is_correct)])
    return features
def encode_w_data_id(tokenizer, max_seq_length, t: Instance):
    tokens = ["[CLS]"] + t.tokens + ["[SEP]"]
    segment_ids = [0] * (len(t.tokens) + 2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([int(t.label)])
    features['data_id'] = create_int_feature([int(t.data_id)])
    return features
def write(self, insts: List[Instance], out_path):
    writer = RecordWriterWrap(out_path)
    for inst in insts:
        feature = get_basic_input_feature(self.tokenizer, self.max_seq_length, inst.tokens, inst.seg_ids)
        feature["data_id"] = create_int_feature([int(inst.data_id)])
        feature["label_ids"] = create_int_feature([int(inst.label)])
        writer.write_feature(feature)
    writer.close()
def encode(tokenizer, get_tokens, max_seq_length, inst: Instance) -> OrderedDict:
    tokens1 = get_tokens(inst.pid1)
    tokens2 = get_tokens(inst.pid2)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    return features
def enc_to_feature2(tokenizer, max_seq_length, inst: QCInstanceTokenized) -> OrderedDict:
    seg1 = inst.query_text
    seg2 = inst.candidate_text
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["data_id"] = create_int_feature([int(inst.data_id)])
    feature["label_ids"] = create_int_feature([int(inst.is_correct)])
    return feature
def enc_to_feature(tokenizer, max_seq_length, pc: PerspectiveCandidate) -> OrderedDict:
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    return feature
def encode(inst: Instance) -> OrderedDict:
    # tokenizer and max_seq_length are captured from the enclosing scope
    tokens1: List[str] = tokenizer.tokenize(inst.text1)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenizer.tokenize(inst.text2)[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
    # tokens1, tokens2, label, tokenizer, and max_seq_length are captured from the enclosing scope
    para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens
    tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] + para_tokens + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 1) \
                  + [1] * (len(tokens2) + 1) \
                  + [2] * (len(para_tokens) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([label])
    return features
def encode_classification_feature(max_seq_length, data: Iterable[Tuple[str, str, int]]) -> Iterable[OrderedDict]:
    tokenizer = get_tokenizer()
    encoder = FirstSegmentAsDoc(max_seq_length)
    for query, text, label in data:
        q_tokens = tokenizer.tokenize(query)
        text_tokens = tokenizer.tokenize(text)
        input_tokens, segment_ids = encoder.encode(q_tokens, text_tokens)[0]
        feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
        feature['label_ids'] = create_int_feature([label])
        yield feature
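# Hedged usage sketch (assumption): write_classification_tfrecord_sketch is a hypothetical
# driver, not part of this module. It shows how the generator above could be combined with
# RecordWriterWrap, the same writer wrapper used by the other functions in this file.
def write_classification_tfrecord_sketch(data, max_seq_length, out_path):
    writer = RecordWriterWrap(out_path)
    # data is an iterable of (query, text, label) triples
    for feature in encode_classification_feature(max_seq_length, data):
        writer.write_feature(feature)
    writer.close()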
def _write_instances(self, insts, output_file):
    writer = RecordWriterWrap(output_file)
    for instance in insts:
        word_tokens, def_tokens, segment_ids = instance
        word_tokens_ids = self.tokenizer.convert_tokens_to_ids(word_tokens)
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length, def_tokens, segment_ids)
        while len(word_tokens_ids) < self.max_word_tokens:
            word_tokens_ids.append(0)
        features["word"] = create_int_feature(word_tokens_ids)
        writer.write_feature(features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    for (inst_index, instance) in enumerate(new_inst_list):
        tokens, segment_ids = instance
        features = get_basic_input_feature(self.tokenizer, self.target_seq_length, tokens, segment_ids)
        features["use_context"] = create_int_feature([1])
        writer.write_feature(features)
    writer.close()
    tf_logging.info("Wrote %d total instances", writer.total_written)
    return example_numbers
def get_feature(tokens1, tokens2, info):
    # data_id_gen, info_list, tokenizer, and max_seq_length are captured from the enclosing scope
    data_id = data_id_gen.new_id()
    info_list[data_id] = info
    tokens = tokens1 + tokens2
    segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([0])
    features['data_id'] = create_int_feature([data_id])
    return features
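# Hedged sketch (assumption): data_id_gen above is assumed to be a simple monotonically
# increasing id generator, in line with the plain data_id_idx counter used in tf_record_gen.
# A minimal hypothetical equivalent, not the repo's actual class:
class DataIdGenSketch:
    def __init__(self, start: int = 0):
        self.next_id = start

    def new_id(self) -> int:
        ret = self.next_id
        self.next_id += 1
        return ret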
def encode(inst: Payload) -> OrderedDict:
    # tokenizer, tokenize_from_tokens, and max_seq_length are captured from the enclosing scope
    tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
    max_seg2_len = max_seq_length - 3 - len(tokens1)
    tokens2 = tokenize_from_tokens(inst.passage)[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
def encode_fn(inst: Instance):
    # self is captured from the enclosing method scope
    tokens1 = inst.tokens1
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2 = inst.tokens2[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:self.max_seq_length]
    segment_ids = segment_ids[:self.max_seq_length]
    features = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.label])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
def write(self, insts, out_path):
    writer = RecordWriterWrap(out_path)
    f = open(out_path + ".info", "wb")
    doc_id_list = []
    for inst in insts:
        tokens, segment_ids, doc_id = inst
        feature = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
        doc_id_list.append(doc_id)
        writer.write_feature(feature)
    pickle.dump(doc_id_list, f)
    writer.close()
def write_instances(self, new_inst_list, outfile):
    writer = RecordWriterWrap(outfile)
    example_numbers = []
    for (inst_index, instance) in enumerate(new_inst_list):
        features = get_basic_input_feature(self.tokenizer, self.max_seq_length, instance.tokens, instance.segment_ids)
        writer.write_feature(features)
        if inst_index < 20:
            log_print_inst(instance, features)
    writer.close()
    return example_numbers
def encode_fn(self, inst: QKInstance) -> OrderedDict:
    max_seq_length = self.max_seq_length
    tokens1: List[str] = self.tokenizer.tokenize(inst.query_text)
    max_seg2_len = self.max_seq_length - 3 - len(tokens1)
    tokens2 = inst.doc_tokens[:max_seg2_len]
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([inst.is_correct])
    features['data_id'] = create_int_feature([inst.data_id])
    return features
def encode(inst: TextInstance) -> OrderedDict:
    # tokenizer, max_seq_length, long_count, and long_warning are captured from the enclosing scope
    tokens = tokenizer.tokenize(inst.text)
    max_len = max_seq_length - 2
    if len(tokens) > max_len:
        nonlocal long_count
        long_count = long_count + 1
        if long_count > 10 and long_warning:
            print("long text count", long_count)
        tokens = tokens[:max_len]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    seg_ids = [0] * len(tokens)
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, tokens, seg_ids)
    feature['label_ids'] = create_int_feature([inst.label])
    feature['data_id'] = create_int_feature([inst.data_id])
    return feature
def encode(inst: Tuple[str, int]) -> OrderedDict:
    # tokenizer, max_seq_length, and long_count are captured from the enclosing scope
    text, label = inst
    tokens = tokenizer.tokenize(text)
    max_len = max_seq_length - 2
    if len(tokens) > max_len:
        nonlocal long_count
        long_count = long_count + 1
        if long_count > 10:
            print("long text count", long_count)
        tokens = tokens[:max_len]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    seg_ids = [0] * len(tokens)
    feature: OrderedDict = get_basic_input_feature(tokenizer, max_seq_length, tokens, seg_ids)
    feature['label_ids'] = create_int_feature([label])
    return feature
def encode(record: Record) -> OrderedDict:
    tokens = ["[CLS]"] + record.claim_tokens + ["[SEP]"] + record.doc_tokens + ["[SEP]"]
    segment_ids = [0] * (len(record.claim_tokens) + 2) \
                  + [1] * (len(record.doc_tokens) + 1)
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
    labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
    labels += (max_seq_length - len(labels)) * [0.]
    label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
    label_mask += (max_seq_length - len(label_mask)) * [0]
    features['label_ids'] = create_float_feature(labels)
    features['label_masks'] = create_int_feature(label_mask)
    return features
def encode(self, inst: Instance) -> OrderedDict:
    if not self.reverse:
        tokens1 = self.get_p_tokens(inst.pid)
        tokens2 = inst.sent
    else:
        tokens1 = inst.sent
        tokens2 = self.get_p_tokens(inst.pid)
    tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
    segment_ids = [0] * (len(tokens1) + 2) \
                  + [1] * (len(tokens2) + 1)
    max_seq_length = self.max_seq_length
    tokens = tokens[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]
    features = get_basic_input_feature(self.tokenizer, max_seq_length, tokens, segment_ids)
    features['label_ids'] = create_int_feature([0])
    features['data_ids'] = create_int_feature([inst.data_id])
    return features
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query], q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0
    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning: query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)
        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc, doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List, List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]
                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)
                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))
    writer.close()
    return data_info
def enc_to_feature(pc: PerspectiveCandidate) -> OrderedDict:
    # tokenizer, max_seq_length, dims, and zero_vector are captured from the enclosing scope
    emb_model = get_aux_embedding_fn(pc.cid)
    seg1 = tokenizer.tokenize(pc.claim_text)
    seg2 = tokenizer.tokenize(pc.p_text)
    input_tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
    aux_emb = get_word_embedding(emb_model, input_tokens, dims)
    # pad the per-token embedding list to max_seq_length with zero vectors, then flatten
    aux_emb += (max_seq_length - len(aux_emb)) * [zero_vector]
    aux_emb = np.array(aux_emb)
    flat_aux_emb = np.reshape(aux_emb, [-1])
    segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
    feature = get_basic_input_feature(tokenizer, max_seq_length, input_tokens, segment_ids)
    feature["label_ids"] = create_int_feature([int(pc.label)])
    feature["aux_emb"] = create_float_feature(flat_aux_emb)
    return feature