Пример #1
0
def hash_arg(arg, dep, frame, fe, event_emb_vocab, word_emb_vocab,
             typed_event_vocab, entity_represents):
    """
    Turn a raw argument dict into its vocabulary-indexed counterpart.

    The argument role is looked up in ``event_emb_vocab`` with four
    representations, tried in order until one yields an index other than -1:
    dependency + entity, entity only, unknown entity with dependency, and
    the fully unknown argument. If all four lookups give -1 the argument is
    dropped.

    :param arg: the argument dict; ``entity_id`` and ``arg_context`` are
        always read, ``gold_role`` and ``source`` are read when present.
    :param dep: the dependency label of the argument; it is passed through
        ``get_simple_dep`` when the frame formalism is 'Propbank'.
    :param frame: the frame used to build the frame element representation.
    :param fe: the frame element name.
    :param event_emb_vocab: vocabulary queried through ``get_index``.
    :param word_emb_vocab: vocabulary handed to ``hash_context``.
    :param typed_event_vocab: provider of the textual representations.
    :param entity_represents: mapping queried with ``arg['entity_id']``.
    :return: a copy of ``arg`` without ``arg_context`` and ``role`` and with
        the hashed fields added, or an empty dict if no role index is found.
    """
    if hash_params.frame_formalism == 'Propbank':
        simple_dep = get_simple_dep(dep)
    else:
        simple_dep = dep

    represent = entity_represents.get(arg['entity_id'])
    entity_rep = typed_event_vocab.get_arg_entity_rep(arg, represent)

    # Candidates go from most to least specific. They are wrapped in lambdas
    # so that a fallback is only built when the previous lookup failed.
    role_builders = (
        lambda: typed_event_vocab.get_arg_rep(simple_dep, entity_rep),
        lambda: typed_event_vocab.get_arg_rep_no_dep(entity_rep),
        lambda: typed_event_vocab.get_unk_arg_with_dep(simple_dep),
        lambda: typed_event_vocab.get_unk_arg_rep(),
    )

    for build_role in role_builders:
        arg_role = build_role()
        arg_role_id = event_emb_vocab.get_index(arg_role, None)
        if arg_role_id != -1:
            break
    else:
        # Every candidate came back as -1.
        logging.info(
            f"The argument with {simple_dep}:{entity_rep} cannot be mapped to "
            f"vocabulary, ignore this argument.")
        return {}

    fe_id = event_emb_vocab.get_index(
        typed_event_vocab.get_fe_rep(frame, fe),
        typed_event_vocab.oovs['fe'],
    )

    hashed_context = hash_context(word_emb_vocab, arg['arg_context'])

    # Work on a shallow copy so the caller's dict is left untouched.
    hashed_arg = dict(arg.items())

    if 'gold_role' in hashed_arg and hashed_arg['source'] == 'gold':
        formalism = hash_params.frame_formalism
        gold_role = hashed_arg['gold_role']
        if formalism == 'Propbank':
            hashed_arg['gold_role_id'] = get_propbank_role_index(gold_role)
        elif formalism == 'Framenet':
            hashed_arg['gold_role_id'] = get_framenet_role_index(
                frame, gold_role, event_emb_vocab, typed_event_vocab)

    for dropped_key in ('arg_context', 'role'):
        hashed_arg.pop(dropped_key, None)

    hashed_arg.update({
        'arg_role_text': arg_role,
        'arg_role': arg_role_id,
        'dep': simple_dep,
        'represent': entity_rep,
        'fe': fe_id,
        'context': hashed_context,
    })

    return hashed_arg
Пример #2
0
def get_dep_group(arg_info):
    """
    Work out a rough dependency group for the argument.

    For generated data the system dependency can be mapped to the group
    (i.e. nsubj -> subj). For gold standard data the gold standard role
    should be used instead. This is only needed for the fixed slot mode.

    As written, a value is produced only for gold arguments whose simplified
    parsed dependency passes ``is_propbank_dep``. Every other path is left
    unimplemented and yields None.

    :param arg_info: the argument dict; ``source`` is always read, ``dep`` is
        read for gold arguments, and ``gold_role`` is read when the parsed
        dependency is not accepted by ``is_propbank_dep``.
    :return: the simplified dependency, or None.
    """
    if arg_info["source"] != "gold":
        # Handling of non-gold (generated) arguments is not implemented.
        return None

    parsed_dep = get_simple_dep(arg_info["dep"])
    if is_propbank_dep(parsed_dep):
        return parsed_dep

    # The mapping from the gold role is not implemented; the lookups are kept
    # so that missing keys still surface as errors.
    gold_role = arg_info["gold_role"]
    if hash_params.frame_formalism == "Propbank":
        pass
    return None
Пример #3
0
def hash_arg(
    arg,
    dep,
    frame,
    fe,
    event_emb_vocab,
    word_emb_vocab,
    typed_event_vocab,
    entity_represents,
):
    """
    Turn a raw argument dict into its vocabulary-indexed counterpart.

    The argument role is looked up in ``event_emb_vocab`` with four
    representations, tried in order until one yields an index other than -1:
    dependency + entity, entity only, unknown entity with dependency, and
    the fully unknown argument. If all four lookups give -1 the argument is
    dropped.

    :param arg: the argument dict; ``entity_id`` and ``arg_context`` are
        always read, ``gold_role`` and ``source`` are read when present.
    :param dep: the dependency label of the argument; it is passed through
        ``get_simple_dep`` when the frame formalism is "Propbank".
    :param frame: the frame used to build the frame element representation.
    :param fe: the frame element name.
    :param event_emb_vocab: vocabulary queried through ``get_index``.
    :param word_emb_vocab: vocabulary handed to ``hash_context``.
    :param typed_event_vocab: provider of the textual representations.
    :param entity_represents: mapping queried with ``arg["entity_id"]``.
    :return: a copy of ``arg`` without ``arg_context`` and ``role`` and with
        the hashed fields added, or an empty dict if no role index is found.
    """
    if hash_params.frame_formalism == "Propbank":
        simple_dep = get_simple_dep(dep)
    else:
        simple_dep = dep

    represent = entity_represents.get(arg["entity_id"])
    entity_rep = typed_event_vocab.get_arg_entity_rep(arg, represent)

    # Candidates go from most to least specific. They are wrapped in lambdas
    # so that a fallback is only built when the previous lookup failed.
    role_builders = (
        lambda: typed_event_vocab.get_arg_rep(simple_dep, entity_rep),
        lambda: typed_event_vocab.get_arg_rep_no_dep(entity_rep),
        lambda: typed_event_vocab.get_unk_arg_with_dep(simple_dep),
        lambda: typed_event_vocab.get_unk_arg_rep(),
    )

    for build_role in role_builders:
        arg_role = build_role()
        arg_role_id = event_emb_vocab.get_index(arg_role, None)
        if arg_role_id != -1:
            break
    else:
        # Every candidate came back as -1.
        logging.info(
            f"The argument with {simple_dep}:{entity_rep} cannot be mapped to "
            f"vocabulary, ignore this argument.")
        return {}

    fe_id = event_emb_vocab.get_index(
        typed_event_vocab.get_fe_rep(frame, fe),
        typed_event_vocab.oovs["fe"],
    )

    hashed_context = hash_context(word_emb_vocab, arg["arg_context"])

    # Work on a shallow copy so the caller's dict is left untouched.
    hashed_arg = dict(arg.items())

    if "gold_role" in hashed_arg and hashed_arg["source"] == "gold":
        formalism = hash_params.frame_formalism
        gold_role = hashed_arg["gold_role"]
        if formalism == "Propbank":
            hashed_arg["gold_role_id"] = get_propbank_role_index(gold_role)
        elif formalism == "Framenet":
            hashed_arg["gold_role_id"] = get_framenet_role_index(
                frame, gold_role, event_emb_vocab, typed_event_vocab)

    for dropped_key in ("arg_context", "role"):
        hashed_arg.pop(dropped_key, None)

    hashed_arg.update({
        "arg_role_text": arg_role,
        "arg_role": arg_role_id,
        "dep": simple_dep,
        "represent": entity_rep,
        "fe": fe_id,
        "context": hashed_context,
    })

    return hashed_arg
Пример #4
0
def create_sentences(doc,
                     event_vocab,
                     output_path,
                     include_frame=False,
                     use_simple_dep=False,
                     prop_arg_only=False):
    """
    Write one line of event tokens per document to a gzipped file.

    Every line of the gzipped input is parsed as a JSON document; lines that
    are not valid JSON are skipped. For each event the predicate token is
    emitted, optionally followed by the frame token, then one token per
    argument, each optionally followed by its frame element token. The tokens
    of one document are joined with single spaces and written as one line.

    Frame element tokens are only emitted for events that have a frame of
    their own (``event['frame']`` other than 'NA'); a frame is never carried
    over from a previous event.

    :param doc: path of the gzipped input, one JSON document per line.
    :param event_vocab: vocabulary object providing ``get_pred_rep``,
        ``get_vocab_word``, ``get_arg_rep``, ``get_arg_entity_rep``,
        ``get_fe_rep`` and the ``oovs`` mapping.
    :param output_path: path of the gzipped output file.
    :param include_frame: also emit frame and frame element tokens.
    :param use_simple_dep: pass each dependency through ``get_simple_dep``.
    :param prop_arg_only: skip arguments whose dependency is rejected by
        ``is_propbank_dep``.
    :return: None
    """
    # Local import keeps this block self-contained.
    import logging

    if include_frame:
        print("Adding frames to sentences.")

    doc_count = 0
    event_count = 0

    with gzip.open(doc) as data, gzip.open(output_path, 'w') as out:
        for line in data:
            try:
                doc_info = json.loads(line)
            except JSONDecodeError:
                continue

            sentence = []

            for event in doc_info['events']:
                event_count += 1

                sentence.append(event_vocab.get_pred_rep(event))

                # Reset for every event: an event without a frame must not
                # reuse (or fail on) the frame of an earlier event.
                frame = None
                if include_frame and event['frame'] != 'NA':
                    frame = event_vocab.get_vocab_word(event['frame'], 'frame')
                    sentence.append(frame)

                for arg in event['arguments']:
                    dep = arg['dep']

                    # An argument covering the predicate span is the root.
                    if (arg['argStart'] == event['predicateStart']
                            and arg['argEnd'] == event['predicateEnd']):
                        dep = 'root'

                    if use_simple_dep:
                        dep = get_simple_dep(dep)

                    if prop_arg_only and not is_propbank_dep(dep):
                        continue

                    sentence.append(
                        event_vocab.get_arg_rep(
                            dep, event_vocab.get_arg_entity_rep(arg, None)))

                    if frame is not None and arg['feName'] != 'NA':
                        fe = event_vocab.get_fe_rep(frame, arg['feName'])
                        if fe != event_vocab.oovs['fe']:
                            sentence.append(fe)

            doc_count += 1

            # A bare 'NA' token means a placeholder leaked into the output.
            # Report it once per document instead of stopping in a debugger.
            if 'NA' in sentence:
                logging.warning(
                    "Document %d contains a bare 'NA' token.", doc_count)

            out.write((' '.join(sentence) + '\n').encode())

            if event_count % 1000 == 0:
                print('\rCreated sentences for {} documents, '
                      '{} events.'.format(doc_count, event_count),
                      end='')

    print('\rCreated sentences for {} documents, '
          '{} events.\n'.format(doc_count, event_count),
          end='')
Пример #5
0
def create_sentences(
    doc,
    event_vocab,
    output_path,
    include_frame=False,
    use_simple_dep=False,
    prop_arg_only=False,
):
    """
    Write one line of event tokens per document to a gzipped file.

    Every line of the gzipped input is parsed as a JSON document; lines that
    are not valid JSON are skipped. For each event the predicate token is
    emitted, optionally followed by the frame token, then one token per
    argument, each optionally followed by its frame element token. The tokens
    of one document are joined with single spaces and written as one line.

    Frame element tokens are only emitted for events that have a frame of
    their own (``event["frame"]`` other than "NA"); a frame is never carried
    over from a previous event.

    :param doc: path of the gzipped input, one JSON document per line.
    :param event_vocab: vocabulary object providing ``get_pred_rep``,
        ``get_vocab_word``, ``get_arg_rep``, ``get_arg_entity_rep``,
        ``get_fe_rep`` and the ``oovs`` mapping.
    :param output_path: path of the gzipped output file.
    :param include_frame: also emit frame and frame element tokens.
    :param use_simple_dep: pass each dependency through ``get_simple_dep``.
    :param prop_arg_only: skip arguments whose dependency is rejected by
        ``is_propbank_dep``.
    :return: None
    """
    # Local import keeps this block self-contained.
    import logging

    if include_frame:
        print("Adding frames to sentences.")

    doc_count = 0
    event_count = 0

    with gzip.open(doc) as data, gzip.open(output_path, "w") as out:
        for line in data:
            try:
                doc_info = json.loads(line)
            except JSONDecodeError:
                continue

            sentence = []

            for event in doc_info["events"]:
                event_count += 1

                sentence.append(event_vocab.get_pred_rep(event))

                # Reset for every event: an event without a frame must not
                # reuse (or fail on) the frame of an earlier event.
                frame = None
                if include_frame and event["frame"] != "NA":
                    frame = event_vocab.get_vocab_word(event["frame"], "frame")
                    sentence.append(frame)

                for arg in event["arguments"]:
                    dep = arg["dep"]

                    # An argument covering the predicate span is the root.
                    if (arg["argStart"] == event["predicateStart"]
                            and arg["argEnd"] == event["predicateEnd"]):
                        dep = "root"

                    if use_simple_dep:
                        dep = get_simple_dep(dep)

                    if prop_arg_only and not is_propbank_dep(dep):
                        continue

                    sentence.append(
                        event_vocab.get_arg_rep(
                            dep, event_vocab.get_arg_entity_rep(arg, None)))

                    if frame is not None and arg["feName"] != "NA":
                        fe = event_vocab.get_fe_rep(frame, arg["feName"])
                        if fe != event_vocab.oovs["fe"]:
                            sentence.append(fe)

            doc_count += 1

            # A bare "NA" token means a placeholder leaked into the output.
            # Report it once per document instead of stopping in a debugger.
            if "NA" in sentence:
                logging.warning(
                    "Document %d contains a bare 'NA' token.", doc_count)

            out.write((" ".join(sentence) + "\n").encode())

            if event_count % 1000 == 0:
                print(
                    "\rCreated sentences for {} documents, "
                    "{} events.".format(doc_count, event_count),
                    end="",
                )

    print(
        "\rCreated sentences for {} documents, "
        "{} events.\n".format(doc_count, event_count),
        end="",
    )