def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence": "i'm looking for a flight from New York to London.",
                "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept",
                          "O", "B-city.dest"]
                // the number of tokens in sequence.split() and tags must match
            },
            ...
        ],
        "slots": [  // tag_key
            "O",  // tags should be in IOB format
            "B-city.dept",
            "I-city.dept",
            "B-city.dest",
            "I-city.dest",
            ...
        ]
    }
    """
    data = self._get_data(file_path)
    tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data)

    helper = Helper(**{
        "file_path": file_path,
        "tag_idx2text": tag_idx2text,
        "ignore_tag_idx": self.ignore_tag_idx,
        "cls_token": self.cls_token,
        "sep_token": self.sep_token,
    })
    helper.set_model_parameter({
        "num_tags": len(tag_idx2text),
        "ignore_tag_idx": self.ignore_tag_idx,
    })
    helper.set_predict_helper({
        "tag_idx2text": tag_idx2text,
    })

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_text = example["sequence"].strip().replace("\n", "")
        sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
        naive_tokens = sequence_text.split()
        is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens)

        sequence_sub_tokens = []
        tagged_sub_token_idxs = []
        curr_sub_token_idx = 1  # skip CLS_TOKEN
        for token_idx, token in enumerate(sequence_tokens):
            for sub_token_pos, sub_token in enumerate(
                    self.subword_tokenizer.tokenize(token, unit="word")):
                sequence_sub_tokens.append(sub_token)
                # a tag points at the first sub-token of each head word
                if is_head_word[token_idx] and sub_token_pos == 0:
                    tagged_sub_token_idxs.append(curr_sub_token_idx)
                curr_sub_token_idx += 1

        bert_input = [self.cls_token] + sequence_sub_tokens + [self.sep_token]

        if (self.sequence_max_length is not None
                and data_type == "train"
                and len(bert_input) > self.sequence_max_length):
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        tag_texts = example[self.tag_key]
        tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts]

        utils.sanity_check_iob(naive_tokens, tag_texts)
        assert len(naive_tokens) == len(tagged_sub_token_idxs), \
            f"""Wrong tagged_sub_token_idxs: the following do not match.
            naive_tokens: {naive_tokens}
            sequence_sub_tokens: {sequence_sub_tokens}
            tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
            "tagged_sub_token_idxs": tagged_sub_token_idxs,
            "num_tokens": len(naive_tokens),
        }
        features.append(feature_row)

        label_row = {
            "id": data_uid,
            "tag_idxs": tag_idxs,
            "tag_texts": tag_texts,
        }
        labels.append(label_row)

        helper.set_example(data_uid, {
            "sequence": sequence_text,
            "sequence_sub_tokens": sequence_sub_tokens,
            "tag_idxs": tag_idxs,
            "tag_texts": tag_texts,
        })

    return utils.make_batch(features, labels), helper.to_dict()
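
# The alignment loop above is the subtle part of this reader: slot tags are
# defined per whitespace token, but BERT consumes sub-tokens, so each tag must
# point at the first sub-token of the word it labels. Below is a minimal,
# self-contained sketch of that bookkeeping; the hard-coded sub-word split is
# a hypothetical WordPiece-style example, not output of the project's actual
# tokenizers, and it assumes every word is its own head word.
def _demo_tag_alignment():
    naive_tokens = ["from", "New", "York"]
    # Hypothetical sub-word split of the three words.
    sub_tokens_per_word = [["from"], ["new"], ["yo", "##rk"]]

    tagged_sub_token_idxs = []
    curr_sub_token_idx = 1  # position 0 is reserved for [CLS]
    for word_sub_tokens in sub_tokens_per_word:
        for sub_token_pos, _ in enumerate(word_sub_tokens):
            if sub_token_pos == 0:  # head sub-token carries the word's tag
                tagged_sub_token_idxs.append(curr_sub_token_idx)
            curr_sub_token_idx += 1

    # Three words -> three tagged positions [1, 2, 3];
    # "##rk" at position 4 is skipped and inherits no tag.
    assert tagged_sub_token_idxs == [1, 2, 3]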
def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence": "what a wonderful day!",
                "emotion": "happy"
            },
            ...
        ],
        "emotion": [  // class_key
            "angry",
            "happy",
            "sad",
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)
    class_idx2text, class_text2idx = self._get_class_dicts(data=data)

    helper = Helper(**{
        "file_path": file_path,
        "class_idx2text": class_idx2text,
        "class_text2idx": class_text2idx,
    })
    helper.set_model_parameter({
        "num_classes": len(class_idx2text),
    })
    helper.set_predict_helper({
        "class_idx2text": class_idx2text,
    })

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence = example["sequence"].strip().replace("\n", "")
        sequence_words = self.word_tokenizer.tokenize(sequence)

        if (self.sequence_max_length is not None
                and data_type == "train"
                and len(sequence_words) > self.sequence_max_length):
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        feature_row = {
            "id": data_uid,
            "sequence": sequence,
        }
        features.append(feature_row)

        class_text = example[self.class_key]
        label_row = {
            "id": data_uid,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        }
        labels.append(label_row)

        helper.set_example(data_uid, {
            "sequence": sequence,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        })

    return utils.make_batch(features, labels), helper.to_dict()
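
# `_get_class_dicts` is defined elsewhere in the reader and is not shown here.
# Below is one plausible implementation, sketched purely for illustration: it
# assumes the label vocabulary can be recovered from the examples themselves
# via `class_key`. The function name suffix and the sorted-set strategy are
# assumptions, not the project's actual code.
def _get_class_dicts_sketch(data, class_key="emotion"):
    # Collect the label inventory from the examples (sorted for determinism).
    class_texts = sorted({example[class_key] for example in data})
    class_idx2text = {idx: text for idx, text in enumerate(class_texts)}
    class_text2idx = {text: idx for idx, text in enumerate(class_texts)}
    return class_idx2text, class_text2idx

# Example:
#   _get_class_dicts_sketch([{"sequence": "what a wonderful day!",
#                             "emotion": "happy"}])
#   -> ({0: "happy"}, {"happy": 0})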
def _read(self, file_path, data_type=None):
    """
    .json file structure should be something like this:

    {
        "data": [
            {
                "sequence": "what a wonderful day!",
                "emotion": "happy"
            },
            ...
        ],
        "emotion": [  // class_key
            "angry",
            "happy",
            "sad",
            ...
        ]
    }
    """
    data = self._get_data(file_path, data_type=data_type)
    class_idx2text, class_text2idx = self._get_class_dicts(data=data)

    helper = Helper(**{
        "file_path": file_path,
        "class_idx2text": class_idx2text,
        "class_text2idx": class_text2idx,
        "cls_token": self.cls_token,
        "sep_token": self.sep_token,
        "dataset": SeqClsBertDataset,
        "metric_key": self.METRIC_KEY,
    })
    helper.set_model_parameter({
        "num_classes": len(class_idx2text),
    })
    helper.set_predict_helper({
        "class_idx2text": class_idx2text,
    })

    features, labels = [], []

    for example in tqdm(data, desc=data_type):
        sequence_a = utils.get_sequence_a(example)
        sequence_b = example.get("sequence_b", None)

        sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
        sequence_b_tokens = None
        if sequence_b:
            sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

        bert_input = utils.make_bert_input(
            sequence_a,
            sequence_b,
            self.tokenizer,
            max_seq_length=self.sequence_max_length,
            data_type=data_type,
            cls_token=self.cls_token,
            sep_token=self.sep_token,
            input_type=self.input_type,
        )
        if bert_input is None:
            continue

        if "uid" in example:
            data_uid = example["uid"]
        else:
            data_uid = str(uuid.uuid1())

        # token_type (segment_ids) will be added in dataset
        feature_row = {
            "id": data_uid,
            "bert_input": bert_input,
        }
        features.append(feature_row)

        class_text = example[self.class_key]
        label_row = {
            "id": data_uid,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        }
        labels.append(label_row)

        helper.set_example(data_uid, {
            "sequence_a": sequence_a,
            "sequence_a_tokens": sequence_a_tokens,
            "sequence_b": sequence_b,
            "sequence_b_tokens": sequence_b_tokens,
            "class_idx": class_text2idx[class_text],
            "class_text": class_text,
        })

        if self.is_test and len(features) >= 10:
            break

    return utils.make_batch(features, labels), helper.to_dict()
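
# `utils.make_bert_input` lives in the project's utils module; the sketch
# below shows only the standard single-sentence / sentence-pair packing the
# call site above implies. The truncation policy (drop over-long *training*
# examples by returning None, which the caller treats as `continue`) is an
# assumption inferred from the reader code, and `input_type` is omitted here.
def make_bert_input_sketch(sequence_a, sequence_b, tokenizer,
                           max_seq_length=None, data_type=None,
                           cls_token="[CLS]", sep_token="[SEP]"):
    tokens_a = tokenizer.tokenize(sequence_a)

    if sequence_b:
        tokens_b = tokenizer.tokenize(sequence_b)
        # Pair input: [CLS] a... [SEP] b... [SEP]
        bert_input = [cls_token] + tokens_a + [sep_token] + tokens_b + [sep_token]
    else:
        # Single input: [CLS] a... [SEP]
        bert_input = [cls_token] + tokens_a + [sep_token]

    # Mirror the readers' length policy: over-long training examples are
    # dropped rather than truncated.
    if (max_seq_length is not None and data_type == "train"
            and len(bert_input) > max_seq_length):
        return None
    return bert_input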