def _fix_word(i, index, annot_id, h5fn_graph, h5fn_word, pad_ind):
    """
    Load the precomputed top-50 graph node tokens for one item and look up a BERT
    embedding for each of them.

    :param i: Index selecting which `top_50_{i}` node list to load (e.g. the answer choice).
    :param index: Item index used as the group key in the graph HDF5 file.
    :param annot_id: Annotation id of the item, used to sanity-check the HDF5 group.
    :param h5fn_graph: Path to the HDF5 file holding the `top_50_{i}` node lists.
    :param h5fn_word: Path to the HDF5 file holding a `word` embedding per token id.
    :param pad_ind: Tag assigned to every node token (they carry no detection tag).
    :return: (BertField of the node tokens, SequenceLabelField of their tags)
    """
    new_tokenization_with_tags = []
    with h5py.File(h5fn_graph, 'r') as h5:
        grp_items = {k: np.array(v) for k, v in h5[str(index)].items()}
        if annot_id != grp_items['annot_id']:
            raise ValueError("annot_id is different!!")
        node_num_list = grp_items[f'top_50_{i}']

    bert_embs = np.zeros([len(node_num_list), 768])
    for node_idx, tok in enumerate(node_num_list):  # renamed from `i` to avoid shadowing the parameter
        new_tokenization_with_tags.append((tok, pad_ind))
        # Look up the 768-d BERT embedding for this node token.
        with h5py.File(h5fn_word, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(tok)].items()}
            bert_embs[node_idx, :] = grp_items['word']

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
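
# Usage sketch (illustrative only, not part of the original pipeline): how _fix_word
# might be invoked for one dataset item. The HDF5 paths and the `item` dict are
# placeholders; the expected file layout (an 'annot_id' and 'top_50_{i}' datasets per
# item index, plus a 'word' embedding per token id) is taken from the function above.
def _example_fix_word(item_index, item):
    return _fix_word(i=0,
                     index=item_index,
                     annot_id=item['annot_id'],              # placeholder item dict
                     h5fn_graph='path/to/graph_nodes.h5',    # placeholder path
                     h5fn_word='path/to/word_embs.h5',       # placeholder path
                     pad_ind=-1)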
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers, pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.

    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param bert_embs: Precomputed BERT embeddings, one row per token after the detections are expanded.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :param token_indexers: Unused in this function; kept for interface compatibility.
    :param pad_ind: Tag used for ordinary (non-detection) tokens.
    :return: (BertField of the tokens, SequenceLabelField of their tags)
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            # A detection: expand it into one token per referenced object.
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind))
                # People get a gender-neutral name; other objects keep their class label.
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
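
# A minimal usage sketch (illustrative, not part of the original pipeline): build the
# text/tag fields for a toy question. The sentence format (plain tokens interleaved with
# lists of detection indices) and the 768-d embedding width match the code above; the
# concrete tokens, detections and zero embeddings below are made up for the example.
def _example_fix_tokenization():
    tokenized_sent = ['Why', 'is', [0], 'smiling', 'at', [1], '?']   # two detections
    old_det_to_new_ind = [1, 2]          # old detection id -> tag used in the fields
    obj_to_type = ['person', 'dog']      # class label per old detection id
    # One embedding row per token after the detection lists are expanded (7 here).
    bert_embs = np.zeros([7, 768], dtype=np.float16)
    return _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind,
                             obj_to_type, token_indexers=None, pad_ind=-1)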
def __call__(self, tokens, embeddings):
    """
    tokens: Tokenized sentence with detections collapsed to a list.
    embeddings: Precomputed BERT embeddings, one row per token after the detections are expanded.
    """
    new_tokens_with_tags = []
    for tok in tokens:
        if isinstance(tok, list):
            # A detection: expand it into one token per referenced object.
            for int_name in tok:
                obj_type = self.obj2type[int_name]
                new_ind = self.obj2ind[int_name]
                assert new_ind >= 0  # keep consistent with R2C
                if obj_type == 'person':
                    # People get a gender-neutral name; other objects keep their class label.
                    text2use = GENDER_NEUTRAL_NAMES[
                        (new_ind - self.obj_start_ind) % len(GENDER_NEUTRAL_NAMES)]
                else:
                    text2use = obj_type
                new_tokens_with_tags.append((text2use, new_ind))
        else:
            new_tokens_with_tags.append((tok, self.pad_ind))

    # Remember to come back and remove this check later.
    assert len(new_tokens_with_tags) == embeddings.shape[0]

    text_field = BertField([Token(x[0]) for x in new_tokens_with_tags],
                           embeddings,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokens_with_tags], text_field)
    return text_field, tags
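
# Usage sketch (assumptions noted): `fixer` stands for an instance of the enclosing
# class, constructed elsewhere with obj2type, obj2ind, obj_start_ind and pad_ind set
# (those attribute names come from __call__ above); the tokens and zero embeddings
# below are made up for the example.
def _example_call(fixer):
    tokens = ['Why', 'is', [0], 'smiling', '?']              # one detection referencing object 0
    embeddings = np.zeros([5, 768], dtype=np.float16)        # one row per expanded token
    return fixer(tokens, embeddings)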
def _fix_visual_concept(visual_concept, visual_concept_num, h5fn, pad_ind):
    """
    Look up a BERT embedding for each visual concept token.

    :param visual_concept: List of visual concept tokens.
    :param visual_concept_num: Concept ids used as group keys into the HDF5 file, aligned with visual_concept.
    :param h5fn: Path to the HDF5 file holding a `word` embedding per concept id.
    :param pad_ind: Tag assigned to every concept token (they carry no detection tag).
    :return: (BertField of the concept tokens, SequenceLabelField of their tags)
    """
    bert_embs = np.zeros([len(visual_concept), 768])
    new_tokenization_with_tags = []
    for i, tok in enumerate(visual_concept):
        new_tokenization_with_tags.append((tok, pad_ind))
        # Look up the 768-d BERT embedding for this concept.
        with h5py.File(h5fn, 'r') as h5:
            grp_items = {k: np.array(v) for k, v in h5[str(visual_concept_num[i])].items()}
            bert_embs[i, :] = grp_items['word']

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
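
# Usage sketch (illustrative only): embedding a short list of visual concepts. The
# concept tokens, ids and the HDF5 path are placeholders; the expected file layout
# (a group per concept id containing a 'word' embedding) is taken from the function above.
def _example_fix_visual_concept():
    visual_concept = ['dog', 'frisbee']                      # placeholder concept tokens
    visual_concept_num = [1532, 2087]                        # placeholder concept ids
    return _fix_visual_concept(visual_concept, visual_concept_num,
                               h5fn='path/to/concept_word_embs.h5',  # placeholder path
                               pad_ind=-1)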