def _read(self, file_path, data_type=None):
    """Read a WikiSQL-style dataset file and build (features, labels) batches.

    Resolves ``file_path`` through the data handler, then expects two sibling
    artifacts next to it: ``<name>.db`` (SQLite tables, opened via DBEngine)
    and ``<name>.tables.jsonl`` (table schemas).  Each example yields a
    feature row (column headers + question) and a label row describing the
    target SQL query, plus a helper entry keyed by a generated example uid.

    Returns:
        (batch, helper): ``make_batch(features, labels)`` and the helper dict.
    """
    file_path = self.data_handler.read(file_path, return_path=True)
    file_path = Path(file_path)
    data_dir = file_path.parent
    file_name = file_path.stem

    # Sibling artifacts derived from the dataset file name.
    db_path = data_dir / f"{file_name}.db"
    table_path = data_dir / f"{file_name}.tables.jsonl"
    self.dbengine = DBEngine(db_path)

    helper = {"file_path": file_path, "db_path": db_path, "examples": {}}
    features, labels = [], []

    sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)
    for sql_data in tqdm(sql_datas, desc=data_type):
        question = sql_data["question"]
        table_id = sql_data["table_id"]
        column_headers = table_data[table_id]["header"]

        feature_row = {"column": column_headers, "question": question}

        data_uid = str(uuid.uuid1())

        # NOTE(review): method name has a typo ("coditions") but it is defined
        # elsewhere in the class; renaming only the call site would break it.
        conditions_value_position = self.get_coditions_value_position(
            sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]]
        )

        sql_query = Query.from_dict(sql_data["sql"], ordered=True)
        execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

        label_row = {
            "id": data_uid,
            "table_id": table_id,
            "tokenized_question": self.word_tokenizer.tokenize(question),
            "aggregator_idx": sql_data["sql"]["agg"],
            "select_column_idx": sql_data["sql"]["sel"],
            "conditions_num": len(sql_data["sql"]["conds"]),
            "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
            "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
            "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
            "conditions_value_position": conditions_value_position,
            "sql_query": sql_query,
            "execution_result": execution_result,
        }

        features.append(feature_row)
        labels.append(label_row)

        helper["examples"][data_uid] = {
            "question": question,
            "sql_query": sql_query,
            "execution_result": execution_result,
        }

        # Smoke-test mode: cap the dataset at 10 examples.  Use ">=" (not the
        # original fragile "==") so the cap still holds if the loop ever gains
        # a skip path, matching the sibling readers in this file.
        if self.is_test and len(labels) >= 10:
            break

    return make_batch(features, labels), helper
def __call__(self, features, labels, apply_pad_labels=(), apply_pad_values=()):
    """Collate features and labels in place, then wrap them in a batch.

    Features go through the default (padded) collate path.  Labels skip
    padding, except for the keys named in ``apply_pad_labels``, which are
    padded with the corresponding entries of ``apply_pad_values``.
    """
    self.collate(features)
    self.collate(
        labels,
        apply_pad=False,
        apply_pad_labels=apply_pad_labels,
        apply_pad_values=apply_pad_values,
    )
    return make_batch(features, labels)
def test_make_batch():
    """make_batch must expose its inputs unchanged as .features / .labels."""
    feature_dict = {"f1": 0, "f2": 1, "f3": 3}
    label_dict = {"l1": 0, "l2": 1, "l3": 2}

    batch = make_batch(feature_dict, label_dict)

    assert batch.features == feature_dict
    assert batch.labels == label_dict
def test_batch_sort_by_key():
    """sort_by_key must order features by value length and keep labels aligned."""
    features = [
        {"f1": "long long long"},
        {"f1": "short"},
        {"f1": "mid mid"},
    ]
    labels = [
        {"l1": 3},
        {"l1": 1},
        {"l1": 2},
    ]

    batch = make_batch(features, labels)
    batch.sort_by_key("f1")

    assert batch.features == sorted(features, key=lambda x: len(x["f1"]))
    # The original test only checked features; labels must be permuted in
    # lockstep, otherwise the feature/label pairing is silently broken after
    # sorting.  Expected order: "short" (1), "mid mid" (2), "long long long" (3).
    assert batch.labels == [{"l1": 1}, {"l1": 2}, {"l1": 3}]
def _read(self, file_path, data_type=None):
    """Read an IOB-tagged sequence-labeling dataset and build batches.

    .json file structure should be something like this:
    {
        "data": [
            {
                "sequence": "i'm looking for a flight from New York to London.",
                "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept",
                          "O", "B-city.dest"]
                // the number of tokens in sequence.split() and tags must match
            },
            ...
        ],
        "slots": [  // tag_key
            "O",  // tags should be in IOB format
            "B-city.dept",
            "I-city.dept",
            "B-city.dest",
            "I-city.dest",
            ...
        ]
    }
    """
    data, raw_dataset = self._get_data(file_path)
    tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data)

    helper = {
        "file_path": file_path,
        "examples": {},
        "raw_dataset": raw_dataset,
        "tag_idx2text": tag_idx2text,
        "ignore_tag_idx": self.ignore_tag_idx,
        "cls_token": self.CLS_TOKEN,
        "sep_token": self.SEP_TOKEN,
        "unk_token": self.UNK_TOKEN,
        "model": {
            "num_tags": len(tag_idx2text),
            "ignore_tag_idx": self.ignore_tag_idx,
        },
        "predict_helper": {
            "tag_idx2text": tag_idx2text,
        }
    }
    features, labels = [], []

    for example in tqdm(raw_dataset, desc=data_type):
        sequence_text = example["sequence"].strip().replace("\n", "")
        sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
        naive_tokens = sequence_text.split()
        # Flags which word-tokenizer tokens start a whitespace-split word, so
        # the per-word tags can be aligned onto the subword sequence below.
        is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens)

        sequence_sub_tokens = []
        tagged_sub_token_idxs = []
        curr_sub_token_idx = 1  # skip CLS_TOKEN
        for token_idx, token in enumerate(sequence_tokens):
            for sub_token_pos, sub_token in enumerate(
                self.subword_tokenizer.tokenize(token, unit="word")
            ):
                sequence_sub_tokens.append(sub_token)
                # Only the first sub-token of each head word carries a tag.
                if is_head_word[token_idx] and sub_token_pos == 0:
                    tagged_sub_token_idxs.append(curr_sub_token_idx)
                curr_sub_token_idx += 1

        bert_input = [self.CLS_TOKEN] + sequence_sub_tokens + [self.SEP_TOKEN]

        # Over-long training examples are dropped, not truncated.
        if (
            self.sequence_max_length is not None
            and data_type == "train"
            and len(bert_input) > self.sequence_max_length
        ):
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        tag_texts = example[self.tag_key]
        tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts]

        utils.sanity_check_iob(naive_tokens, tag_texts)
        # One tagged subword index per whitespace word, or alignment failed.
        assert len(naive_tokens) == len(tagged_sub_token_idxs), \
            f"""Wrong tagged_sub_token_idxs: followings mismatch.
        naive_tokens: {naive_tokens}
        sequence_sub_tokens: {sequence_sub_tokens}
        tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
            "tagged_sub_token_idxs": tagged_sub_token_idxs,
            "num_tokens": len(naive_tokens),
        }
        features.append(feature_row)

        label_row = {
            "id": data_uid,
            "tag_idxs": tag_idxs,
            "tag_texts": tag_texts,
        }
        labels.append(label_row)

        helper["examples"][data_uid] = {
            "sequence": sequence_text,
            "sequence_sub_tokens": sequence_sub_tokens,
            "tag_idxs": tag_idxs,
            "tag_texts": tag_texts,
        }

    return make_batch(features, labels), helper
def _read(self, file_path, data_type=None):
    """Read a sequence-pair regression dataset and build batches.

    .json file structure should be something like this:
    {
        "data": [
            {
                "sequence_a": "what a wonderful day!",
                "sequence_b": "what a great day!",
                "score": 0.9
            },
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)

    helper = {
        "file_path": file_path,
        "examples": {},
        "cls_token": self.CLS_TOKEN,
        "sep_token": self.SEP_TOKEN,
        "unk_token": self.UNK_TOKEN,
        "model": {},
        "predict_helper": {}
    }

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_a = utils.get_sequence_a(example)
        sequence_b = example.get("sequence_b", None)

        sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
        sequence_b_sub_tokens = None

        # [CLS] a-tokens [SEP] (+ b-tokens [SEP] when a second sequence exists)
        bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]
        if sequence_b is not None:
            sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
            bert_input = bert_input + sequence_b_sub_tokens + [self.SEP_TOKEN]

        # Over-long training examples are dropped, not truncated.
        too_long = (
            self.sequence_max_length is not None
            and data_type == "train"
            and len(bert_input) > self.sequence_max_length
        )
        if too_long:
            continue

        data_uid = example["uid"] if "uid" in example else str(uuid.uuid1())

        features.append({
            "id": data_uid,
            "bert_input": bert_input,
        })

        score = example[self.label_key]
        labels.append({
            "id": data_uid,
            "score": score,
        })

        helper["examples"][data_uid] = {
            "sequence_a": sequence_a,
            "sequence_a_sub_tokens": sequence_a_sub_tokens,
            "sequence_b": sequence_b,
            "sequence_b_sub_tokens": sequence_b_sub_tokens,
            "score": score,
        }

        # Smoke-test mode: cap the dataset at 10 examples.
        if self.is_test and len(features) >= 10:
            break

    return make_batch(features, labels), helper
def __call__(self, features, labels):
    """Pad-collate the features, collate the labels without padding, and
    bundle both into a batch."""
    self.collate(features)
    self.collate(labels, apply_pad=False)

    return make_batch(features, labels)
def _read(self, file_path, data_type=None):
    """Read a single-sequence classification dataset and build batches.

    .json file structure should be something like this:
    {
        "data": [
            {
                "sequence": "what a wonderful day!",
                "emotion": "happy"
            },
            ...
        ],
        "emotion": [  // class_key
            "angry",
            "happy",
            "sad",
            ...
        ]
    }
    """
    data, raw_dataset = self._get_data(file_path, data_type=data_type)
    class_idx2text, class_text2idx = self._get_class_dicts(data=data)

    helper = {
        "file_path": file_path,
        "examples": {},
        "raw_dataset": raw_dataset,
        "class_idx2text": class_idx2text,
        "class_text2idx": class_text2idx,
        "cls_token": self.CLS_TOKEN,
        "sep_token": self.SEP_TOKEN,
        "unk_token": self.UNK_TOKEN,
        "continue_symbol": self.CONTINUE_SYMBOL,
        "model": {
            "num_classes": len(class_idx2text),
        },
        "predict_helper": {
            "class_idx2text": class_idx2text,
        }
    }

    features, labels = [], []

    for example in tqdm(raw_dataset, desc=data_type):
        sequence_text = example["sequence"].strip().replace("\n", "")
        sequence_sub_tokens = self.subword_tokenizer.tokenize(sequence_text)
        bert_input = [self.CLS_TOKEN] + sequence_sub_tokens + [self.SEP_TOKEN]

        # Over-long training examples are dropped, not truncated.
        skip_example = (
            self.sequence_max_length is not None
            and data_type == "train"
            and len(bert_input) > self.sequence_max_length
        )
        if skip_example:
            continue

        data_uid = example["uid"] if "uid" in example else str(uuid.uuid1())

        features.append({
            "id": data_uid,
            "bert_input": bert_input,
        })

        class_text = example[self.class_key]
        class_idx = class_text2idx[class_text]
        labels.append({
            "id": data_uid,
            "class_idx": class_idx,
            "class_text": class_text,
        })

        helper["examples"][data_uid] = {
            "sequence": sequence_text,
            "sequence_sub_tokens": sequence_sub_tokens,
            "class_idx": class_idx,
            "class_text": class_text,
        }

    return make_batch(features, labels), helper
def _read(self, file_path, data_type=None):
    """Read a SQuAD-style dataset and build BERT features/labels.

    Tokenizes each context at word then subword level while tracking
    character spans, aligns the most common gold answer span onto the
    subword sequence, and emits one (feature, label) pair per BERT window
    produced by ``self._make_features_and_labels``.

    Returns:
        (batch, helper): ``make_batch(features, labels)`` plus a helper dict
        with one entry per question id.
    """
    word_tokenized_error_count, subword_tokenized_error_count = 0, 0

    if data_type != "train":
        self.context_stride = 64  # NOTE: hard-code

    data = self.data_handler.read(file_path)
    squad = json.loads(data)
    if "data" in squad:
        squad = squad["data"]

    helper = {
        "file_path": file_path,
        "examples": {},
        "raw_dataset": squad,
        "cls_token": self.CLS_TOKEN,
        "sep_token": self.SEP_TOKEN,
        "model": {
            "lang_code": self.lang_code,
        },
    }
    features, labels = [], []

    for article in tqdm(squad, desc=data_type):
        for paragraph in article["paragraphs"]:
            # Normalize quote styles so character offsets stay consistent.
            context_text = paragraph["context"].replace("``", '" ').replace("''", '" ')
            context_tokens = self.word_tokenizer.tokenize(context_text)
            context_spans, char_to_word_offset = self._convert_to_spans(
                context_text, context_tokens)
            context_tokens = [
                Token(text, span)
                for (text, span) in zip(context_tokens, context_spans)
            ]

            # Subword tokens inherit the character span of their word token.
            context_sub_tokens = []
            for token in context_tokens:
                for sub_token in self.subword_tokenizer.tokenize(token.text):
                    context_sub_tokens.append(Token(sub_token, token.text_span))

            for qa in paragraph["qas"]:
                question_text = qa["question"]
                question_text = " ".join(self.word_tokenizer.tokenize(question_text))
                question_sub_tokens = [
                    Token(subword)
                    for subword in self.subword_tokenizer.tokenize(question_text)
                ]

                id_ = qa["id"]
                answer_texts, answer_indices = [], []

                # SQuAD v2: impossible questions carry "plausible_answers".
                # (Removed a dead `answers = qa["answers"]` assignment that
                # both branches below immediately overwrote.)
                if qa.get("is_impossible", None):
                    answers = qa["plausible_answers"]
                    answerable = 0
                else:
                    answers = qa["answers"]
                    answerable = 1

                for answer in answers:
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer["text"]) - 1
                    answer_texts.append(answer["text"])
                    answer_indices.append((answer_start, answer_end))

                if len(answer_indices) > 0:
                    # Pick the most frequently annotated character span and map
                    # it to word indices.
                    answer_char_start, answer_char_end = self._find_one_most_common(
                        answer_indices)
                    answer_word_start = char_to_word_offset[answer_char_start]
                    answer_word_end = char_to_word_offset[answer_char_end]

                    char_answer_text = context_text[answer_char_start:answer_char_end + 1]
                    word_answer_text = context_text[
                        context_spans[answer_word_start][0]:
                        context_spans[answer_word_end][1]]

                    if not self._is_rebuild(char_answer_text, word_answer_text):
                        logger.warning(
                            f"word_tokenized_error: {char_answer_text} ### {word_answer_text}"
                        )
                        word_tokenized_error_count += 1
                else:  # Unanswerable
                    # NOTE(review): char_answer_text / word_answer_text are NOT
                    # assigned on this path, so the subword comparison below
                    # reads a stale value from a previous iteration (or raises
                    # NameError on the first unanswerable qa) whenever a window
                    # survives the `continue` filter — confirm against
                    # _make_features_and_labels before changing.
                    answers = ["<noanswer>"]
                    answer_char_start, answer_char_end = -1, -1
                    answer_word_start, answer_word_end = -1, -1

                bert_features, bert_labels = self._make_features_and_labels(
                    context_sub_tokens,
                    question_sub_tokens,
                    answer_char_start,
                    answer_char_end + 1,
                )

                for (index, (feature, label)) in enumerate(
                        zip(bert_features, bert_labels)):
                    bert_tokens = feature
                    answer_start, answer_end = label

                    # Skip windows whose answer indices fall outside the window
                    # or land on special tokens (which have no character span).
                    if (answer_start < 0
                            or answer_start >= len(bert_tokens)
                            or answer_end >= len(bert_tokens)
                            or bert_tokens[answer_start].text_span is None
                            or bert_tokens[answer_end].text_span is None):
                        continue

                    char_start = bert_tokens[answer_start].text_span[0]
                    char_end = bert_tokens[answer_end].text_span[1]

                    bert_answer = context_text[char_start:char_end]
                    if char_answer_text != bert_answer:
                        # Bug fix: removed a stray ")" that the original left
                        # inside this log message.
                        logger.warning(
                            f"subword_tokenized_error: {char_answer_text} ### {word_answer_text}"
                        )
                        subword_tokenized_error_count += 1

                    feature_row = {
                        "bert_input": [token.text for token in bert_tokens],
                        "bert_token": bert_tokens,
                    }
                    features.append(feature_row)

                    bert_id = id_ + f"#{index}"
                    label_row = {
                        "id": bert_id,  # question_id + bert_index
                        "answer_texts": "\t".join(answer_texts),
                        "answer_start": answer_start,
                        "answer_end": answer_end,
                        "answerable": answerable,
                    }
                    labels.append(label_row)

                    if id_ not in helper["examples"]:
                        helper["examples"][id_] = {
                            "context": context_text,
                            "question": question_text,
                            "answers": answer_texts,
                        }
                    helper["examples"][id_][f"bert_tokens_{index}"] = bert_tokens

    logger.info(
        f"tokenized_error_count - word: {word_tokenized_error_count} | subword: {subword_tokenized_error_count}"
    )
    return make_batch(features, labels), helper
def _read(self, file_path, data_type=None):
    """Read a SQuAD-style dataset and build word-level features/labels.

    Unlike the BERT reader, features keep the (cleaned) context and question
    strings; labels hold word-level answer span indices.

    Returns:
        (batch, helper): make_batch(features, labels) plus a helper dict with
        one entry per question id.
    """
    tokenized_error_count = 0

    data = self.data_handler.read(file_path)
    squad = json.loads(data)
    if "data" in squad:
        squad = squad["data"]

    helper = {
        "file_path": file_path,
        "examples": {},  # qid: {context: ..., text_span: ..., question: ..., answer_texts}
        "raw_dataset": squad,
        "model": {
            "lang_code": self.lang_code,
        },
    }
    features, labels = [], []

    for article in tqdm(squad, desc=data_type):
        for paragraph in article["paragraphs"]:
            # Normalize quote styles so character offsets stay consistent.
            context = paragraph["context"].replace("``", '" ').replace("''", '" ')
            context_words = self.word_tokenizer.tokenize(context)

            # Over-long training contexts are dropped, not truncated.
            if (
                self.context_max_length is not None
                and data_type == "train"
                and len(context_words) > self.context_max_length
            ):
                continue

            for qa in paragraph["qas"]:
                question = qa["question"].strip().replace("\n", "")
                id_ = qa["id"]

                answer_texts, answer_indices = [], []

                # SQuAD v2: impossible questions carry "plausible_answers".
                if qa.get("is_impossible", None):
                    answers = qa["plausible_answers"]
                    answerable = 0
                else:
                    answers = qa["answers"]
                    answerable = 1

                for answer in answers:
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer["text"])
                    answer_texts.append(answer["text"])
                    answer_indices.append((answer_start, answer_end))

                feature_row = {
                    "context": self._clean_text(context),
                    "question": question,
                }
                features.append(feature_row)

                if len(answer_indices) > 0:
                    # Pick the most frequently annotated char span, then map it
                    # to word indices via the token character spans.
                    answer_start, answer_end = self._find_one_most_common(answer_indices)
                    text_spans = self._convert_to_spans(context, context_words)
                    word_idxs = self._get_word_span_idxs(text_spans, answer_start, answer_end)

                    word_answer_start = word_idxs[0]
                    word_answer_end = word_idxs[-1]

                    # To check rebuild answer: char_answer_text - word_answer_text
                    char_answer_text = context[answer_start:answer_end]
                    word_answer_text = context[
                        text_spans[word_answer_start][0] : text_spans[word_answer_end][1]
                    ]

                    if not self._is_rebuild(char_answer_text, word_answer_text):
                        logger.warning(f"word_tokenized_error: {char_answer_text} ### {word_answer_text}")
                        tokenized_error_count += 1
                else:  # Unanswerable
                    answers = ["<noanswer>"]
                    text_spans = []
                    answer_start, answer_end = 0, 0
                    word_answer_start, word_answer_end = 0, 0

                label_row = {
                    "id": id_,
                    "answer_start": word_answer_start,
                    "answer_end": word_answer_end,
                    "answerable": answerable,
                }
                labels.append(label_row)

                helper["examples"][id_] = {
                    "context": context,
                    "text_span": text_spans,
                    "question": question,
                    "answers": answer_texts,
                }

    logger.info(f"tokenized_error_count: {tokenized_error_count} ")
    return make_batch(features, labels), helper