예제 #1
0
def validate_entity_mention_token_ref_sequences(comm):
    """Validate the TokenRefSequence of every EntityMention.

    Every mention in every EntityMentionSet is checked; a failed check
    logs its own error (inside `validate_token_ref_sequence`).

    Returns:

    - `True` only if all token reference sequences are valid
    """
    # Build the full list first so every mention is validated (and its
    # errors logged) rather than short-circuiting on the first failure.
    results = [
        validate_token_ref_sequence(comm, mention.tokens)
        for mention_set in lun(comm.entityMentionSetList)
        for mention in lun(mention_set.mentionList)]
    return all(results)
예제 #2
0
def validate_situations(comm):
    """Validate all Situations in a Communication.

    Verifies that, for every Situation:

    - each argument's `situationId` (if set) names an existing Situation
    - each argument's `entityId` (if set) names an existing Entity
    - each Justification's `mentionId` names an existing SituationMention,
      and its token reference sequences are themselves valid
    - every entry in `mentionIdList` names an existing SituationMention

    Problems are reported through `logging.error`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if all Situations are valid, `False` otherwise
    """
    valid = True

    # uuidString sets used to verify the cross-references below
    entity_uuidString_set = _get_entity_uuidString_set(comm)
    situation_mention_uuidString_set = _get_situation_mention_uuidString_set(
        comm)
    situation_uuidString_set = _get_situation_uuidString_set(comm)

    for situationSet in lun(comm.situationSetList):
        for situation in lun(situationSet.situationList):
            for argument in lun(situation.argumentList):
                # An argument may reference another Situation...
                if (argument.situationId and
                        argument.situationId.uuidString not in
                        situation_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("Argument for Situation '%s' has an invalid"
                         " situationId (%s). Tool='%s'") %
                        (situation.uuid, argument.situationId,
                         situationSet.metadata.tool)))
                # ...and/or an Entity
                if (argument.entityId and
                        argument.entityId.uuidString not in
                        entity_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("Argument for Situation '%s' has an invalid entityId"
                         " (%s). Tool='%s'") %
                        (situation.uuid, argument.entityId,
                         situationSet.metadata.tool)))
            for justification in lun(situation.justificationList):
                # Justifications must point at a known SituationMention
                if (justification.mentionId.uuidString not in
                        situation_mention_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("Justification for Situation '%s' has an invalid"
                         " [situation] mentionId (%s). Tool='%s'") %
                        (situation.uuid, justification.mentionId,
                         situationSet.metadata.tool)))
                if justification.tokenRefSeqList:
                    for tokenRefSeq in justification.tokenRefSeqList:
                        valid &= validate_token_ref_sequence(
                            comm, tokenRefSeq)
            for mentionId in lun(situation.mentionIdList):
                if (mentionId.uuidString not in
                        situation_mention_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("Situation '%s' has an invalid [situation] mentionId"
                         " (%s). Tool='%s'") %
                        (situation.uuid, mentionId,
                         situationSet.metadata.tool)))
    return valid
예제 #3
0
def validate_situations(comm):
    """Validate all Situations in a Communication.

    Checks that each Situation argument's `situationId`/`entityId`, each
    Justification's `mentionId`, and every entry of the Situation's
    `mentionIdList` refer to UUIDs that exist in this Communication, and
    validates the token reference sequences of all Justifications.
    Problems are reported through `logging.error`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if all Situations are valid, `False` otherwise
    """
    valid = True

    # uuidString sets used to check the cross-references below
    entity_uuidString_set = _get_entity_uuidString_set(comm)
    situation_mention_uuidString_set = _get_situation_mention_uuidString_set(
        comm)
    situation_uuidString_set = _get_situation_uuidString_set(comm)

    for situationSet in lun(comm.situationSetList):
        for situation in lun(situationSet.situationList):
            for argument in lun(situation.argumentList):
                # An argument may point at another Situation...
                if (argument.situationId and argument.situationId.uuidString
                        not in situation_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(2, ("Argument for Situation '%s' has an invalid"
                                 " situationId (%s). Tool='%s'") %
                             (situation.uuid, argument.situationId,
                              situationSet.metadata.tool)))
                # ...and/or at an Entity
                if (argument.entityId and argument.entityId.uuidString
                        not in entity_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(2, (
                            "Argument for Situation '%s' has an invalid entityId"
                            " (%s). Tool='%s'") %
                             (situation.uuid, argument.entityId,
                              situationSet.metadata.tool)))
            for justification in lun(situation.justificationList):
                # Justifications must reference a known SituationMention
                if (justification.mentionId.uuidString
                        not in situation_mention_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(2,
                             ("Justification for Situation '%s' has an invalid"
                              " [situation] mentionId (%s). Tool='%s'") %
                             (situation.uuid, justification.mentionId,
                              situationSet.metadata.tool)))
                if justification.tokenRefSeqList:
                    for tokenRefSeq in justification.tokenRefSeqList:
                        valid &= validate_token_ref_sequence(comm, tokenRefSeq)
            for mentionId in lun(situation.mentionIdList):
                if (mentionId.uuidString
                        not in situation_mention_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(2, (
                            "Situation '%s' has an invalid [situation] mentionId"
                            " (%s). Tool='%s'") %
                             (situation.uuid, mentionId,
                              situationSet.metadata.tool)))
    return valid
예제 #4
0
def validate_entity_mention_tokenization_ids(comm):
    """Check that every EntityMention points at a real Tokenization.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if every mention's tokenizationId is the UUID of a
      Tokenization present in the Communication, `False` otherwise
    """
    valid = True
    known_tokenization_uuids = _get_tokenization_uuidString_set(comm)

    for mention_set in lun(comm.entityMentionSetList):
        for mention in lun(mention_set.mentionList):
            tokenization_id = mention.tokens.tokenizationId
            if tokenization_id.uuidString in known_tokenization_uuids:
                continue
            valid = False
            logging.error(_ilm(
                2,
                "Mention '%s' has an invalid tokenizationId (%s)" %
                (mention.uuid, tokenization_id)))
    return valid
예제 #5
0
def validate_situation_mentions(comm):
    """Validate all SituationMentions in a Communication.

    Checks each SituationMention's token reference sequence (when
    present) and, for each MentionArgument, that any referenced
    EntityMention or SituationMention exists in this Communication and
    that exactly one of tokens / entityMentionId / situationMentionId
    is set.  Problems are reported through `logging.error`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if all SituationMentions are valid, `False` otherwise
    """
    valid = True
    entity_mention_uuidString_set = _get_entity_mention_uuidString_set(comm)
    situation_mention_uuidString_set = _get_situation_mention_uuidString_set(
        comm)

    for situationMentionSet in lun(comm.situationMentionSetList):
        for situationMention in lun(situationMentionSet.mentionList):
            if situationMention.tokens:
                valid &= validate_token_ref_sequence(
                    comm, situationMention.tokens)
            for (m_idx, m_arg) in enumerate(situationMention.argumentList):
                # Referenced EntityMention must exist
                if (m_arg.entityMentionId and
                        m_arg.entityMentionId.uuidString not in
                        entity_mention_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("MentionArgument for SituationMention '%s' has an"
                         " invalid entityMentionId (%s). Tool='%s'") %
                        (situationMention.uuid.uuidString,
                         m_arg.entityMentionId,
                         situationMentionSet.metadata.tool)))
                # Referenced SituationMention must exist
                if (m_arg.situationMentionId and
                        m_arg.situationMentionId.uuidString not in
                        situation_mention_uuidString_set):
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("MentionArgument for SituationMention '%s' has an"
                         " invalid situationMentionId (%s). Tool='%s'") %
                        (situationMention.uuid,
                         m_arg.situationMentionId,
                         situationMentionSet.metadata.tool)))
                # Exactly one of the three reference kinds must be set
                total_args = (
                    bool(m_arg.tokens) +
                    bool(m_arg.entityMentionId) +
                    bool(m_arg.situationMentionId)
                )
                if total_args != 1:
                    valid = False
                    logging.error(_ilm(
                        2,
                        ("MentionArgument #%d for SituationMention '%s'"
                         " should have exactly one EntityMention|"
                         "SituationMention|TokenRefSequence, but found %d") %
                        (m_idx, situationMention.uuid.uuidString,
                         total_args)))
    return valid
예제 #6
0
def _get_situation_uuidString_set(comm):
    """Collect the uuidStrings of all Situations in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Situations in the Communication
    """
    return set(
        situation.uuid.uuidString
        for s_set in lun(comm.situationSetList)
        for situation in lun(s_set.situationList))
예제 #7
0
def _get_entity_uuidString_set(comm):
    """Collect the uuidStrings of all Entities in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Entities in the Communication
    """
    return {
        entity.uuid.uuidString
        for e_set in lun(comm.entitySetList)
        for entity in lun(e_set.entityList)}
예제 #8
0
def _get_entity_uuidString_set(comm):
    """Collect the uuidStrings of all Entities in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Entities in the Communication
    """
    uuid_strings = set()
    for entity_set in lun(comm.entitySetList):
        uuid_strings.update(
            entity.uuid.uuidString for entity in lun(entity_set.entityList))
    return uuid_strings
예제 #9
0
def validate_entity_mention_tokenization_ids(comm):
    """Check that every EntityMention references an existing Tokenization.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if all mentions' tokenizationIds are valid, `False` otherwise
    """
    tokenization_uuids = _get_tokenization_uuidString_set(comm)
    valid = True

    for em_set in lun(comm.entityMentionSetList):
        for em in lun(em_set.mentionList):
            ref = em.tokens.tokenizationId
            if ref.uuidString not in tokenization_uuids:
                valid = False
                logging.error(_ilm(
                    2,
                    "Mention '%s' has an invalid tokenizationId (%s)" %
                    (em.uuid, ref)))
    return valid
예제 #10
0
def _get_situation_uuidString_set(comm):
    """Collect the uuidStrings of all Situations in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Situations in the Communication
    """
    return {
        situation.uuid.uuidString
        for situation_set in lun(comm.situationSetList)
        for situation in lun(situation_set.situationList)}
예제 #11
0
def validate_entity_mention_ids(comm):
    """Check that each Entity's mentionIdList names real EntityMentions.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if every referenced entityMentionId exists, `False` otherwise
    """
    known_mention_uuids = _get_entity_mention_uuidString_set(comm)
    valid = True

    for entity_set in lun(comm.entitySetList):
        for entity in lun(entity_set.entityList):
            for mention_id in entity.mentionIdList:
                if mention_id.uuidString in known_mention_uuids:
                    continue
                valid = False
                logging.error(_ilm(
                    2,
                    "Entity '%s' has an invalid entityMentionId (%s)" %
                    (entity.uuid, mention_id)))
    return valid
예제 #12
0
def validate_situation_mentions(comm):
    """Validate all SituationMentions in a Communication.

    Checks each SituationMention's token reference sequence (when
    present) and, for each MentionArgument, that any referenced
    EntityMention or SituationMention exists in this Communication and
    that exactly one of tokens / entityMentionId / situationMentionId
    is set.  Problems are reported through `logging.error`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if all SituationMentions are valid, `False` otherwise
    """
    valid = True
    entity_mention_uuidString_set = _get_entity_mention_uuidString_set(comm)
    situation_mention_uuidString_set = _get_situation_mention_uuidString_set(
        comm)

    for situationMentionSet in lun(comm.situationMentionSetList):
        for situationMention in lun(situationMentionSet.mentionList):
            if situationMention.tokens:
                valid &= validate_token_ref_sequence(comm,
                                                     situationMention.tokens)
            for (m_idx, m_arg) in enumerate(situationMention.argumentList):
                # Referenced EntityMention must exist
                if (m_arg.entityMentionId and m_arg.entityMentionId.uuidString
                        not in entity_mention_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(
                            2,
                            ("MentionArgument for SituationMention '%s' has an"
                             " invalid entityMentionId (%s). Tool='%s'") %
                            (situationMention.uuid.uuidString,
                             m_arg.entityMentionId,
                             situationMentionSet.metadata.tool)))
                # Referenced SituationMention must exist
                if (m_arg.situationMentionId
                        and m_arg.situationMentionId.uuidString
                        not in situation_mention_uuidString_set):
                    valid = False
                    logging.error(
                        _ilm(
                            2,
                            ("MentionArgument for SituationMention '%s' has an"
                             " invalid situationMentionId (%s). Tool='%s'") %
                            (situationMention.uuid, m_arg.situationMentionId,
                             situationMentionSet.metadata.tool)))
                # Exactly one of the three reference kinds must be set
                total_args = (bool(m_arg.tokens) +
                              bool(m_arg.entityMentionId) +
                              bool(m_arg.situationMentionId))
                if total_args != 1:
                    valid = False
                    logging.error(
                        _ilm(
                            2,
                            ("MentionArgument #%d for SituationMention '%s'"
                             " should have exactly one EntityMention|"
                             "SituationMention|TokenRefSequence, but found %d")
                            % (m_idx, situationMention.uuid.uuidString,
                               total_args)))
    return valid
예제 #13
0
def validate_entity_mention_ids(comm):
    """Check that each Entity's mentionIdList names real EntityMentions.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if every referenced entityMentionId exists, `False` otherwise
    """
    valid = True
    mention_uuids = _get_entity_mention_uuidString_set(comm)

    for es in lun(comm.entitySetList):
        for entity in lun(es.entityList):
            # Collect the dangling references first, then report them in
            # their original order.
            bad_ids = [
                mid for mid in entity.mentionIdList
                if mid.uuidString not in mention_uuids]
            for mid in bad_ids:
                valid = False
                logging.error(_ilm(
                    2,
                    "Entity '%s' has an invalid entityMentionId (%s)" %
                    (entity.uuid, mid)))
    return valid
예제 #14
0
def _get_tokenization_uuidString_set(comm):
    """Collect the uuidStrings of all Tokenizations in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Tokenizations in the Communication
    """
    return {
        sentence.tokenization.uuid.uuidString
        for section in lun(comm.sectionList)
        for sentence in lun(section.sentenceList)
        if sentence.tokenization}
예제 #15
0
def _get_tokenization_uuidString_set(comm):
    """Collect the uuidStrings of all Tokenizations in the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - set of strings: uuidStrings for all Tokenizations in the Communication
    """
    uuid_strings = set()
    for section in lun(comm.sectionList):
        # Sentences without a tokenization are skipped
        uuid_strings.update(
            s.tokenization.uuid.uuidString
            for s in lun(section.sentenceList)
            if s.tokenization)
    return uuid_strings
예제 #16
0
def _get_sentence_for_tokenization_uuidString_dict(comm):
    """Map Tokenization uuidStrings to their containing Sentences.

    The mapping is computed once and memoized on the Communication
    object itself.

    Args:

    - `comm` (`Communication`)

    Returns:

    - dictionary mapping of Tokenization uuidStrings to Sentences
    """
    if not hasattr(comm, 'sentence_for_tokenization_uuidString_dict'):
        comm.sentence_for_tokenization_uuidString_dict = {
            sentence.tokenization.uuid.uuidString: sentence
            for section in lun(comm.sectionList)
            for sentence in lun(section.sentenceList)
            if sentence.tokenization}
    return comm.sentence_for_tokenization_uuidString_dict
예제 #17
0
def _get_sentence_for_tokenization_uuidString_dict(comm):
    """Map Tokenization uuidStrings to their containing Sentences.

    The result is built lazily and cached on the Communication object.

    Args:

    - `comm` (`Communication`)

    Returns:

    - dictionary mapping of Tokenization uuidStrings to Sentences
    """
    if not hasattr(comm, 'sentence_for_tokenization_uuidString_dict'):
        mapping = {}
        for section in lun(comm.sectionList):
            for sentence in lun(section.sentenceList):
                tokenization = sentence.tokenization
                if tokenization:
                    mapping[tokenization.uuid.uuidString] = sentence
        comm.sentence_for_tokenization_uuidString_dict = mapping
    return comm.sentence_for_tokenization_uuidString_dict
예제 #18
0
def print_communication_taggings_for_communication(comm, tool=None):
    """Print each CommunicationTagging as 'taggingType: tag:conf tag:conf ...'.

    Args:

    - `comm`: A Concrete Communication
    - `tool`: presumably restricts output to taggings produced by this
      tool when not `None` (`_filter_by_tool` is defined elsewhere) --
      TODO confirm
    """
    communication_taggings = _filter_by_tool(
        lun(comm.communicationTaggingList), tool)
    for tagging in communication_taggings:
        # Pair each tag with its confidence, formatted to 3 decimal
        # places, e.g. "topic: sports:0.950 news:0.050"
        print '%s: %s' % (
            tagging.taggingType,
            ' '.join('%s:%.3f' % p for p in
                     zip(tagging.tagList, tagging.confidenceList))
        )
예제 #19
0
def _get_tokenization_uuidString_dict(comm):
    """Map Tokenization uuidStrings to their Tokenization objects.

    The mapping is computed once and memoized on the Communication.

    Args:

    - `comm` (`Communication`)

    Returns:

    - dictionary mapping uuidStrings to Tokenizations
    """
    if not hasattr(comm, '_tokenization_uuidString_dict'):
        comm._tokenization_uuidString_dict = {
            sentence.tokenization.uuid.uuidString: sentence.tokenization
            for section in lun(comm.sectionList)
            for sentence in lun(section.sentenceList)
            if sentence.tokenization}
    return comm._tokenization_uuidString_dict
예제 #20
0
def _get_tokenization_uuidString_dict(comm):
    """Map Tokenization uuidStrings to their Tokenization objects.

    Built lazily on first call and cached on the Communication object.

    Args:

    - `comm` (`Communication`)

    Returns:

    - dictionary mapping uuidStrings to Tokenizations
    """
    if not hasattr(comm, '_tokenization_uuidString_dict'):
        lookup = {}
        for section in lun(comm.sectionList):
            for sentence in lun(section.sentenceList):
                tokenization = sentence.tokenization
                if tokenization:
                    lookup[tokenization.uuid.uuidString] = tokenization
        comm._tokenization_uuidString_dict = lookup
    return comm._tokenization_uuidString_dict
예제 #21
0
def print_situations(comm, tool=None):
    """Print information for all Situations and their SituationMentions

    Args:

    - `comm`: A Concrete Communication
    - `tool`: If not `None`, only print SituationSets whose
      `metadata.tool` matches this string
    """
    for s_set_idx, s_set in enumerate(lun(comm.situationSetList)):
        if tool is None or s_set.metadata.tool == tool:
            print u"Situation Set %d (%s):" % (s_set_idx,
                                               s_set.metadata.tool)
            for s_idx, situation in enumerate(s_set.situationList):
                print u"  Situation %d-%d:" % (s_set_idx, s_idx)
                _p(6, 18, u"situationType", situation.situationType)
                # Each mention is labeled set-situation-mention, e.g. 0-1-2
                for sm_idx, sm in enumerate(lun(situation.mentionList)):
                    print u" " * 6 + u"SituationMention %d-%d-%d:" % (
                        s_set_idx, s_idx, sm_idx)
                    _print_situation_mention(sm)
                print
            print
예제 #22
0
def print_situations(comm):
    """Print information for all Situations and their SituationMentions

    Args:

    - `comm`: A Concrete Communication
    """
    for s_set_idx, s_set in enumerate(lun(comm.situationSetList)):
        # Include the producing tool in the header when metadata is present
        if s_set.metadata:
            print u"Situation Set %d (%s):" % (s_set_idx, s_set.metadata.tool)
        else:
            print u"Situation Set %d:" % s_set_idx
        for s_idx, situation in enumerate(s_set.situationList):
            print u"  Situation %d-%d:" % (s_set_idx, s_idx)
            _p(6, 18, u"situationType", situation.situationType)
            # Each mention is labeled set-situation-mention, e.g. 0-1-2
            for sm_idx, sm in enumerate(lun(situation.mentionList)):
                print u" " * 6 + u"SituationMention %d-%d-%d:" % (
                    s_set_idx, s_idx, sm_idx)
                _print_situation_mention(sm)
            print
        print
예제 #23
0
def print_situation_mentions(comm, tool=None):
    """Print information for all SituationMentions (some of which may
    not have Situations)

    Args:

    - `comm`: A Concrete Communication
    - `tool`: If not `None`, only print SituationMentionSets whose
      `metadata.tool` matches this string
    """
    for sm_set_idx, sm_set in enumerate(lun(comm.situationMentionSetList)):
        if tool is None or sm_set.metadata.tool == tool:
            print u"Situation Set %d (%s):" % (sm_set_idx,
                                               sm_set.metadata.tool)
            for sm_idx, sm in enumerate(sm_set.mentionList):
                print u"  SituationMention %d-%d:" % (sm_set_idx, sm_idx)
                _print_situation_mention(sm)
                print
            print
예제 #24
0
def validate_communication(comm):
    """Run all validation checks over a Communication.

    Validates the Thrift structure, the token offsets of every Section
    and Sentence, the parses and taggings of every Tokenization, and the
    cross-references of EntityMentions, Entities, Situations and
    SituationMentions.  Results are reported through `logging`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if Communication is valid, `False` otherwise
    """
    valid = True

    logging.info(_ilm(0, "Validating Communication with ID '%s'" % comm.id))

    # Structural (Thrift-level) validation first
    valid &= validate_thrift_deep(comm)

    # Per-section / per-sentence offset and tokenization checks
    for section in lun(comm.sectionList):
        valid &= validate_token_offsets_for_section(section)
        if section.sentenceList:
            logging.debug(
                _ilm(
                    4, "section '%s' has %d sentences" %
                    (section.uuid, len(section.sentenceList))))
            for sentence in section.sentenceList:
                valid &= validate_token_offsets_for_sentence(sentence)
                if sentence.tokenization:
                    valid &= validate_constituency_parses(
                        comm, sentence.tokenization)
                    valid &= validate_dependency_parses(sentence.tokenization)
                    valid &= validate_token_taggings(sentence.tokenization)

    # Cross-reference checks between annotation layers
    valid &= validate_entity_mention_ids(comm)
    valid &= validate_entity_mention_tokenization_ids(comm)
    valid &= validate_entity_mention_token_ref_sequences(comm)
    valid &= validate_situations(comm)
    valid &= validate_situation_mentions(comm)

    if not valid:
        logging.error(
            _ilm(0, "The Communication with ID '%s' IS NOT valid" % comm.id))
    else:
        logging.info(
            _ilm(0, "The Communication with ID '%s' is valid" % comm.id))

    return valid
예제 #25
0
def validate_communication(comm):
    """Run every validation check against a Communication.

    Covers Thrift-level structure, Section/Sentence token offsets,
    Tokenization parses and taggings, and the UUID cross-references of
    EntityMentions, Entities, Situations and SituationMentions.
    Results are reported through `logging`.

    Args:

    - `comm` (`Communication`)

    Returns:

    - `True` if Communication is valid, `False` otherwise
    """
    valid = True

    logging.info(_ilm(0, "Validating Communication with ID '%s'" % comm.id))

    # Structural (Thrift-level) validation first
    valid &= validate_thrift_deep(comm)

    # Per-section / per-sentence offset and tokenization checks
    for section in lun(comm.sectionList):
        valid &= validate_token_offsets_for_section(section)
        if section.sentenceList:
            logging.debug(_ilm(4, "section '%s' has %d sentences" %
                               (section.uuid, len(section.sentenceList))))
            for sentence in section.sentenceList:
                valid &= validate_token_offsets_for_sentence(sentence)
                if sentence.tokenization:
                    valid &= validate_constituency_parses(
                        comm, sentence.tokenization)
                    valid &= validate_dependency_parses(
                        sentence.tokenization)
                    valid &= validate_token_taggings(sentence.tokenization)

    # Cross-reference checks between annotation layers
    valid &= validate_entity_mention_ids(comm)
    valid &= validate_entity_mention_tokenization_ids(comm)
    valid &= validate_entity_mention_token_ref_sequences(comm)
    valid &= validate_situations(comm)
    valid &= validate_situation_mentions(comm)

    if not valid:
        logging.error(
            _ilm(0, "The Communication with ID '%s' IS NOT valid" % comm.id))
    else:
        logging.info(
            _ilm(0, "The Communication with ID '%s' is valid" % comm.id))

    return valid
예제 #26
0
def validate_token_offsets_for_section(section):
    """
    Test if the TextSpan boundaries for all sentences in a section fall
    within the boundaries of the section's TextSpan

    Args:

    - `section` (`Section`)

    Returns:

    - `True` if all TextSpan offsets are consistent, `False` otherwise
    """
    valid = True

    # A Section without a TextSpan has nothing to check
    if section.textSpan is None:
        return valid

    # The Section's own span must be well-formed (start <= ending)
    if section.textSpan.start > section.textSpan.ending:
        valid = False
        logging.error(_ilm(
            2,
            ("Section '%s' has a TextSpan with a start offset (%d) > end"
             " offset (%d)") %
            (section.uuid, section.textSpan.start, section.textSpan.ending)))

    for sentence in lun(section.sentenceList):
        # Sentences without a TextSpan are skipped
        if sentence.textSpan is None:
            continue
        # Each Sentence span must itself be well-formed...
        if sentence.textSpan.start > sentence.textSpan.ending:
            valid = False
            logging.error(_ilm(
                2,
                ("Sentence '%s' has a TextSpan with a start offset (%d) > end"
                 " offset (%d)") %
                (sentence.uuid, sentence.textSpan.start,
                 sentence.textSpan.ending)))
        # ...and must lie entirely within the Section's span
        elif ((sentence.textSpan.start < section.textSpan.start) or
                (sentence.textSpan.start > section.textSpan.ending) or
                (sentence.textSpan.ending < section.textSpan.start) or
                (sentence.textSpan.ending > section.textSpan.ending)):
            valid = False
            logging.error(_ilm(
                2,
                ("Sentence '%s' in Section '%s' has a TextSpan [%d, %d] that"
                 " does not fit within the Section TextSpan [%d, %d]") %
                (sentence.uuid, section.uuid, sentence.textSpan.start,
                 sentence.textSpan.ending, section.textSpan.start,
                 section.textSpan.ending)))

    return valid
예제 #27
0
def print_sections(comm):
    """Print information for all Sections, according to their spans.

    Args:

    - `comm`: A Concrete Communication
    """
    text = comm.text
    for sect_idx, sect in enumerate(lun(comm.sectionList)):
        ts = sect.textSpan
        if ts is None:
            print u"Section %s does not have a textSpan "
            "field set" % (sect.uuid.uuidString)
            continue
        print u"Section %d (%s), from %d to %d:" % (
            sect_idx, sect.uuid.uuidString, ts.start, ts.ending)
        print u"%s" % (text[ts.start:ts.ending])
        print
    print
예제 #28
0
def print_situation_mentions(comm):
    """Print information for all SituationMentions (some of which may
    not have Situations)

    Args:

    - `comm`: A Concrete Communication
    """
    for sm_set_idx, sm_set in enumerate(lun(comm.situationMentionSetList)):
        # Include the producing tool in the header when metadata is present
        if sm_set.metadata:
            print u"Situation Set %d (%s):" % (sm_set_idx,
                                               sm_set.metadata.tool)
        else:
            print u"Situation Set %d:" % sm_set_idx
        for sm_idx, sm in enumerate(sm_set.mentionList):
            print u"  SituationMention %d-%d:" % (sm_set_idx, sm_idx)
            _print_situation_mention(sm)
            print
        print
예제 #29
0
def validate_token_offsets_for_section(section):
    """
    Test if the TextSpan boundaries for all sentences in a section fall
    within the boundaries of the section's TextSpan

    Args:

    - `section` (`Section`)

    Returns:

    - `True` if all TextSpan offsets are consistent, `False` otherwise
    """
    valid = True

    # A Section without a TextSpan has nothing to check
    if section.textSpan is None:
        return valid

    # The Section's own span must be well-formed (start <= ending)
    if section.textSpan.start > section.textSpan.ending:
        valid = False
        logging.error(
            _ilm(2,
                 ("Section '%s' has a TextSpan with a start offset (%d) > end"
                  " offset (%d)") % (section.uuid, section.textSpan.start,
                                     section.textSpan.ending)))

    for sentence in lun(section.sentenceList):
        # Sentences without a TextSpan are skipped
        if sentence.textSpan is None:
            continue
        # Each Sentence span must itself be well-formed...
        if sentence.textSpan.start > sentence.textSpan.ending:
            valid = False
            logging.error(
                _ilm(2, (
                    "Sentence '%s' has a TextSpan with a start offset (%d) > end"
                    " offset (%d)") % (sentence.uuid, sentence.textSpan.start,
                                       sentence.textSpan.ending)))
        # ...and must lie entirely within the Section's span
        elif ((sentence.textSpan.start < section.textSpan.start)
              or (sentence.textSpan.start > section.textSpan.ending)
              or (sentence.textSpan.ending < section.textSpan.start)
              or (sentence.textSpan.ending > section.textSpan.ending)):
            valid = False
            logging.error(
                _ilm(2, (
                    "Sentence '%s' in Section '%s' has a TextSpan [%d, %d] that"
                    " does not fit within the Section TextSpan [%d, %d]") %
                     (sentence.uuid, section.uuid, sentence.textSpan.start,
                      sentence.textSpan.ending, section.textSpan.start,
                      section.textSpan.ending)))

    return valid
예제 #30
0
def print_sections(comm, tool=None):
    """Print information for all Sections, according to their spans.

    Args:

    - `comm`: A Concrete Communication
    """
    if tool is None or comm.metadata.tool == tool:
        text = comm.text
        for sect_idx, sect in enumerate(lun(comm.sectionList)):
            ts = sect.textSpan
            if ts is None:
                print u"Section %s does not have a textSpan "
                "field set" % (sect.uuid.uuidString)
                continue
            print u"Section %d (%s), from %d to %d:" % (
                sect_idx, sect.uuid.uuidString, ts.start, ts.ending)
            print u"%s" % (text[ts.start:ts.ending])
            print
        print
예제 #31
0
def get_entityMentions_by_tokenizationId(comm):
    """Get entity mentions for a Communication grouped by Tokenization
    UUID string

    Args:

    - `comm`: A Concrete Communication object

    Returns:

    - A dictionary of lists of EntityMentions, where the dictionary
      keys are Tokenization UUID strings.
    """
    grouped = defaultdict(list)
    for entity_set in lun(comm.entitySetList):
        for entity in entity_set.entityList:
            for mention in entity.mentionList:
                key = mention.tokens.tokenizationId.uuidString
                grouped[key].append(mention)
    return grouped
예제 #32
0
def _get_entityMentions_by_tokenizationId(comm, tool=None):
    """Get entity mentions for a Communication grouped by Tokenization
    UUID string

    Args:

    - `comm`: A Concrete Communication object
    - `tool`: If not `None`, only include mentions whose
      EntityMentionSet's `metadata.tool` matches this string

    Returns:

    - A dictionary of lists of EntityMentions, where the dictionary
      keys are Tokenization UUID strings.
    """
    grouped = defaultdict(list)
    for entity_set in lun(comm.entitySetList):
        for entity in entity_set.entityList:
            for mention in entity.mentionList:
                # Skip mentions produced by other tools when filtering
                if (tool is not None and
                        mention.entityMentionSet.metadata.tool != tool):
                    continue
                key = mention.tokens.tokenizationId.uuidString
                grouped[key].append(mention)
    return grouped
예제 #33
0
def _print_situation_mention(situationMention):
    """Helper function for printing info for a SituationMention"""
    if situationMention.text:
        _p(10, 20, u"text", situationMention.text)
    if situationMention.situationType:
        _p(10, 20, u"situationType", situationMention.situationType)
    for arg_idx, ma in enumerate(lun(situationMention.argumentList)):
        print u" " * 10 + u"Argument %d:" % arg_idx
        if ma.role:
            _p(14, 16, u"role", ma.role)
        if ma.entityMention:
            _p(14, 16, u"entityMention",
                u" ".join(_get_tokens_for_entityMention(ma.entityMention)))
        # A SituationMention can have an argumentList with a
        # MentionArgument that points to another SituationMention---
        # which could conceivably lead to loops.  We currently don't
        # traverse the list recursively, instead looking at only
        # SituationMentions referenced by top-level SituationMentions
        if ma.situationMention:
            print u" " * 14 + u"situationMention:"
            if situationMention.text:
                _p(18, 20, u"text", situationMention.text)
            if situationMention.situationType:
                _p(18, 20, u"situationType", situationMention.situationType)
예제 #34
0
def _print_situation_mention(situationMention):
    """Helper function for printing info for a SituationMention"""
    if situationMention.text:
        _p(10, 20, u"text", situationMention.text)
    if situationMention.situationType:
        _p(10, 20, u"situationType", situationMention.situationType)
    for arg_idx, ma in enumerate(lun(situationMention.argumentList)):
        print u" " * 10 + u"Argument %d:" % arg_idx
        if ma.role:
            _p(14, 16, u"role", ma.role)
        if ma.entityMention:
            _p(14, 16, u"entityMention",
               u" ".join(get_tokens_for_entityMention(ma.entityMention)))
        # A SituationMention can have an argumentList with a
        # MentionArgument that points to another SituationMention---
        # which could conceivably lead to loops.  We currently don't
        # traverse the list recursively, instead looking at only
        # SituationMentions referenced by top-level SituationMentions
        if ma.situationMention:
            print u" " * 14 + u"situationMention:"
            if situationMention.text:
                _p(18, 20, u"text", situationMention.text)
            if situationMention.situationType:
                _p(18, 20, u"situationType", situationMention.situationType)
예제 #35
0
def get_comm_tokenizations(comm, tool=None):
    """Generate each Tokenization in the Communication's
    section/sentence hierarchy, optionally restricted to those whose
    metadata tool matches `tool`.
    """
    for sect in lun(comm.sectionList):
        for sent in lun(sect.sentenceList):
            tkzn = sent.tokenization
            # Guard clause: skip tokenizations from other tools when a
            # specific tool was requested.
            if tool is not None and tkzn.metadata.tool != tool:
                continue
            yield tkzn
예제 #36
0
def validate_entity_mention_token_ref_sequences(comm):
    """Validate the token ref sequence of every EntityMention in the
    Communication; return True only if all of them are valid.

    Note: every mention is checked (no short-circuiting), matching the
    original accumulate-with-`&=` behavior, so all validation errors
    get logged.
    """
    results = [
        validate_token_ref_sequence(comm, mention.tokens)
        for mention_set in lun(comm.entityMentionSetList)
        for mention in lun(mention_set.mentionList)
    ]
    return all(results)
예제 #37
0
def print_metadata(comm, tool=None):
    """Print metadata for tools used to annotate Communication

    Walks the Communication's annotation layers (tokenizations and
    their parses/taggings, entity and situation sets, communication
    taggings) and prints the tool name recorded in each layer's
    metadata.  When `tool` is given, output is restricted to layers
    annotated by that tool.
    """
    # Collect every Tokenization reachable through the Communication's
    # section/sentence hierarchy, skipping any missing (None) lists.
    def _get_tokenizations(comm):
        tokenizations = []
        if comm.sectionList:
            for section in comm.sectionList:
                if section.sentenceList:
                    for sentence in section.sentenceList:
                        if sentence.tokenization:
                            tokenizations.append(sentence.tokenization)
        return tokenizations

    if tool is None or comm.metadata.tool == tool:
        print u"Communication:  %s\n" % comm.metadata.tool

    # Gather the distinct tool names for each tokenization-level
    # annotation type.  Sets de-duplicate tools shared across
    # sentences.
    dependency_parse_tools = set()
    parse_tools = set()
    tokenization_tools = set()
    token_tagging_tools = set()
    for tokenization in _get_tokenizations(comm):
        tokenization_tools.add(tokenization.metadata.tool)
        if tokenization.tokenTaggingList:
            for tokenTagging in tokenization.tokenTaggingList:
                token_tagging_tools.add(tokenTagging.metadata.tool)
        if tokenization.dependencyParseList:
            for dependencyParse in tokenization.dependencyParseList:
                dependency_parse_tools.add(dependencyParse.metadata.tool)
        if tokenization.parseList:
            for parse in tokenization.parseList:
                parse_tools.add(parse.metadata.tool)

    communication_tagging_tools = set()
    for communication_tagging in lun(comm.communicationTaggingList):
        communication_tagging_tools.add(communication_tagging.metadata.tool)

    # When filtering by tool, reduce each set to (at most) that single
    # tool so the printing loops below need no extra conditionals.
    if tool is not None:
        dependency_parse_tools = dependency_parse_tools.intersection([tool])
        parse_tools = parse_tools.intersection([tool])
        tokenization_tools = tokenization_tools.intersection([tool])
        token_tagging_tools = token_tagging_tools.intersection([tool])
        communication_tagging_tools = communication_tagging_tools.intersection(
            [tool])

    # Tokenization-level layers, sorted for deterministic output; a
    # blank line separates each non-empty group.
    if tokenization_tools:
        for toolname in sorted(tokenization_tools):
            print u"  Tokenization:  %s" % toolname
        print
    if dependency_parse_tools:
        for toolname in sorted(dependency_parse_tools):
            print u"    Dependency Parse:  %s" % toolname
        print
    if parse_tools:
        for toolname in sorted(parse_tools):
            print u"    Parse:  %s" % toolname
        print
    if token_tagging_tools:
        for toolname in sorted(token_tagging_tools):
            print u"    TokenTagging:  %s" % toolname
        print

    # Communication-level entity/situation layers are printed with
    # their list index, filtered per-element rather than via the sets
    # above.
    if comm.entityMentionSetList:
        for i, em_set in enumerate(comm.entityMentionSetList):
            if tool is None or em_set.metadata.tool == tool:
                print u"  EntityMentionSet #%d:  %s" % (
                    i, em_set.metadata.tool)
        print
    if comm.entitySetList:
        for i, entitySet in enumerate(comm.entitySetList):
            if tool is None or entitySet.metadata.tool == tool:
                print u"  EntitySet #%d:  %s" % (
                    i, entitySet.metadata.tool)
        print
    if comm.situationMentionSetList:
        for i, sm_set in enumerate(comm.situationMentionSetList):
            if tool is None or sm_set.metadata.tool == tool:
                print u"  SituationMentionSet #%d:  %s" % (
                    i, sm_set.metadata.tool)
        print
    if comm.situationSetList:
        for i, situationSet in enumerate(comm.situationSetList):
            if tool is None or situationSet.metadata.tool == tool:
                print u"  SituationSet #%d:  %s" % (
                    i, situationSet.metadata.tool)
        print

    if communication_tagging_tools:
        for toolname in sorted(communication_tagging_tools):
            print u"  CommunicationTagging:  %s" % toolname
        print