Exemplo n.º 1
0
    def from_doc(cls, doc):
        """Build an instance of ``cls`` from a parsed document.

        Every token whose POS tag starts with ``'VB'`` is considered as a
        predicate, except tokens with lemma ``'be'`` and tokens for which the
        dependency graph reports an ``'xcomp'`` label. One event is created
        for each (subject, object) pair of the predicate; a missing subject
        or object is represented by ``None``. Predicates with neither a
        subject nor an object are skipped. One entity is created for each
        item of ``doc.corefs``.

        :param doc: the document to convert, a ``document.Document``
        :return: ``cls(doc.doc_name, entities, events)``
        :raises ParseScriptError: if ``doc`` is not a ``document.Document``
        """
        if not isinstance(doc, document.Document):
            expected_name = get_class_name(document.Document)
            raise ParseScriptError(
                'from_doc must be called with a {} instance'.format(
                    expected_name))

        # collect events, sentence by sentence and token by token
        events = []
        for sent in doc.sents:
            for pred_token in sent.tokens:
                # only verb tokens can act as predicates
                if not pred_token.pos.startswith('VB'):
                    continue
                # skip "be" verbs
                if pred_token.lemma == 'be':
                    continue
                # NOBUG: stop verbs are deliberately kept here, since both
                # negation and particle need to be counted in detecting a
                # stop verb; they should be excluded when constructing
                # RichScript
                # TODO: exclude verbs in quotes

                pred_idx = pred_token.token_idx

                # skip modifying verbs
                if sent.dep_graph.lookup_label('gov', pred_idx, 'xcomp'):
                    continue

                # the predicate is negated when a 'neg' label is found
                neg = bool(
                    sent.dep_graph.lookup_label('gov', pred_idx, 'neg'))

                subj_list = sent.get_subj_list(pred_idx)
                obj_list = sent.get_obj_list(pred_idx)
                pobj_list = sent.get_pobj_list(pred_idx)

                # a predicate with neither subject nor object yields no event
                if not (subj_list or obj_list):
                    continue

                # pad the empty side with None so the product is non-empty
                for arg_list in (subj_list, obj_list):
                    if not arg_list:
                        arg_list.append(None)

                events.extend(
                    Event.from_tokens(pred_token, neg, subj, obj, pobj_list)
                    for subj, obj in product(subj_list, obj_list))

        if not events:
            warn('doc {} has no events'.format(doc.doc_name))
        if not doc.corefs:
            warn('doc {} has no corefs'.format(doc.doc_name))

        # collect entities, one per item of doc.corefs
        entities = []
        for coref in doc.corefs:
            entities.append(Entity.from_coref(coref))

        return cls(doc.doc_name, entities, events)
Exemplo n.º 2
0
    def from_doc(cls, doc):
        """Build a script named after ``doc`` and fill it from the document.

        One entity is added for each item of ``doc.corefs``. Every token
        whose POS tag starts with ``'VB'`` is then considered as a predicate,
        except tokens with lemma ``'be'`` and tokens for which the dependency
        graph reports an ``'xcomp'`` label. One event is added for each
        (subject, direct object) pair of the predicate; a missing subject or
        direct object is represented by ``None``. Predicates with neither are
        skipped. A warning is logged when the resulting script has no
        entities or no events.

        :param doc: the document to convert; validated with ``check_type``
            against ``corenlp.Document``
        :return: the populated script created by ``cls(doc.doc_name)``
        """
        check_type(doc, corenlp.Document)
        script = cls(doc.doc_name)

        # entities: one per item of doc.corefs
        for coref in doc.corefs:
            script.add_entity(Entity.from_coref(coref))

        if not script.has_entities():
            log.warn('script {} has no entities'.format(doc.doc_name))

        # events: scan every token of every sentence for predicates
        for sent in doc.sents:
            for pred_token in sent.tokens:
                # only verb tokens can act as predicates
                if not pred_token.pos.startswith('VB'):
                    continue
                # skip "be" verbs
                if pred_token.lemma == 'be':
                    continue

                pred_idx = pred_token.token_idx

                # skip modifying verbs
                if sent.dep_graph.lookup_label('head', pred_idx, 'xcomp'):
                    continue
                # TODO: exclude verbs in quotes
                # NOBUG: stop verbs are deliberately kept here; both negation
                # and particle need to be counted in detecting a stop verb,
                # so stop verbs are removed when constructing RichScript

                # negation flag of the verb
                neg = bool(
                    sent.dep_graph.lookup_label('head', pred_idx, 'neg'))

                # particle of the verb; only the first one is used
                prt = ''
                prt_tokens = sent.lookup_label(
                    'head', pred_idx, 'compound:prt')
                if prt_tokens:
                    num_prt = len(prt_tokens)
                    if num_prt > 1:
                        log.warn(
                            'Predicate {} contains {} particles'.format(
                                pred_token.pretty_print(), num_prt))
                    prt = prt_tokens[0].lemma

                subj_list = sent.get_subj_list(pred_idx)
                dobj_list = sent.get_dobj_list(pred_idx)
                pobj_list = sent.get_pobj_list(pred_idx)

                # a predicate with neither subject nor direct object yields
                # no event
                if not (subj_list or dobj_list):
                    continue

                # pad the empty side with None so the product is non-empty
                for arg_list in (subj_list, dobj_list):
                    if not arg_list:
                        arg_list.append(None)

                for subj, dobj in product(subj_list, dobj_list):
                    script.add_event(
                        Event.from_tokens(
                            pred_token, subj, dobj, pobj_list,
                            neg=neg, prt=prt))

        if not script.has_events():
            log.warn('script {} has no events'.format(doc.doc_name))

        return script