Example #1
0
def _fix_word(i, index, annot_id, h5fn_graph, h5fn_word, pad_ind):
    """
    Read the ``top_50_{i}`` word-id list for entry *index* from the graph HDF5
    file, then fetch each word's precomputed 768-d embedding from the word
    HDF5 file, packaging the result as AllenNLP fields.

    :param i: selects which ``top_50_{i}`` dataset to read from the graph file.
    :param index: group key (stringified) for this entry in the graph file.
    :param annot_id: expected annotation id; must match the one stored on disk.
    :param h5fn_graph: path to the HDF5 file holding the ``top_50_{i}`` lists.
    :param h5fn_word: path to the HDF5 file holding per-word embeddings
        under key ``'word'`` (stored as float16 on disk).
    :param pad_ind: tag assigned to every token (there are no detections here).
    :return: (BertField over the words, SequenceLabelField of tags)
    :raises ValueError: if the stored annot_id does not match *annot_id*.
    """
    with h5py.File(h5fn_graph, 'r') as h5:
        grp_items = {k: np.array(v) for k, v in h5[str(index)].items()}
        # Sanity check that the on-disk entry is the annotation we asked for.
        if annot_id != grp_items['annot_id']:
            raise ValueError("annot_id is different!!")
        node_num_list = grp_items[f'top_50_{i}']

    new_tokenization_with_tags = [(tok, pad_ind) for tok in node_num_list]

    bert_embs = np.zeros([len(node_num_list), 768])
    # Open the word-embedding file once, not once per token.
    with h5py.File(h5fn_word, 'r') as h5:
        for row, tok in enumerate(node_num_list):
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(tok)].items()
            }
            bert_embs[row, :] = grp_items['word']

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags],
                              text_field)
    return text_field, tags
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers, pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    tokens_and_tags = []
    for item in tokenized_sent:
        # Plain words pass through untouched with the padding tag.
        if not isinstance(item, list):
            tokens_and_tags.append((item, pad_ind))
            continue
        # A list marks a detection span: expand each old object id into a
        # text token (gender-neutral name for people) tagged with its new id.
        for old_id in item:
            label = obj_to_type[old_id]
            new_id = old_det_to_new_ind[old_id]
            if new_id < 0:
                raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                    tokenized_sent, old_det_to_new_ind
                ))
            if label == 'person':
                label = GENDER_NEUTRAL_NAMES[new_id % len(GENDER_NEUTRAL_NAMES)]
            tokens_and_tags.append((label, new_id))

    text_field = BertField([Token(word) for word, _ in tokens_and_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([tag for _, tag in tokens_and_tags], text_field)
    return text_field, tags
Example #3
0
    def __call__(self, tokens, embeddings):
        """
        Collapse detection lists in *tokens* into text tokens plus tags.

        :param tokens: tokenized sentence; detection spans appear as lists of
            old object ids, ordinary words appear as-is.
        :param embeddings: per-token embedding matrix; its first dimension
            must equal the number of produced tokens.
        :return: (BertField over the tokens, SequenceLabelField of the tags)
        :raises ValueError: if a remapped index is negative, or if the number
            of tokens does not match ``embeddings.shape[0]``.
        """
        new_tokens_with_tags = []
        for tok in tokens:
            if isinstance(tok, list):
                for int_name in tok:
                    obj_type = self.obj2type[int_name]
                    new_ind = self.obj2ind[int_name]
                    # Explicit raise instead of `assert`: asserts vanish
                    # under `python -O`, and a negative index is invalid data.
                    if new_ind < 0:
                        raise ValueError(
                            "Invalid negative remapped index {} for object {}".format(
                                new_ind, int_name))
                    # Consistent with R2C: people become gender-neutral names.
                    if obj_type == 'person':
                        text2use = GENDER_NEUTRAL_NAMES[
                            (new_ind - self.obj_start_ind) %
                            len(GENDER_NEUTRAL_NAMES)]
                    else:
                        text2use = obj_type
                    new_tokens_with_tags.append((text2use, new_ind))
            else:
                new_tokens_with_tags.append((tok, self.pad_ind))

        # Token count must line up row-for-row with the embedding matrix.
        if len(new_tokens_with_tags) != embeddings.shape[0]:
            raise ValueError(
                "Token/embedding length mismatch: {} tokens vs {} rows".format(
                    len(new_tokens_with_tags), embeddings.shape[0]))
        text_field = BertField([Token(x[0]) for x in new_tokens_with_tags],
                               embeddings,
                               padding_value=0)
        tags = SequenceLabelField([x[1] for x in new_tokens_with_tags],
                                  text_field)
        return text_field, tags
def _fix_visual_concept(visual_concept, visual_concept_num, h5fn, pad_ind):
    """
    Build text + tag fields for a list of visual concepts, fetching each
    concept's 768-d embedding from an HDF5 file.

    :param visual_concept: list of concept words; each becomes one token.
    :param visual_concept_num: parallel list of HDF5 group keys, one per
        concept in *visual_concept*.
    :param h5fn: path to the HDF5 file holding embeddings under key ``'word'``.
    :param pad_ind: tag assigned to every token.
    :return: (BertField over the concepts, SequenceLabelField of tags)
    """
    new_tokenization_with_tags = [(tok, pad_ind) for tok in visual_concept]

    bert_embs = np.zeros([len(visual_concept), 768])
    # Open the embedding file once, not once per concept.
    with h5py.File(h5fn, 'r') as h5:
        for i in range(len(visual_concept)):
            grp_items = {k: np.array(v)
                         for k, v in h5[str(visual_concept_num[i])].items()}
            bert_embs[i, :] = grp_items['word']

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags