def match_t_and_s(elements):
    """Report elements that contain a sentence but no text content.

    The direct children of every element are tested with the
    ``text_content`` and ``sentence`` predicates. An element is flagged when
    at least one child satisfies ``sentence`` while no child satisfies
    ``text_content``. The opposite situation (text content without a
    sentence) is not reported.

    Returns a tuple ``(ok, message)``. ``ok`` is True when no element was
    flagged. ``message`` contains one line per flagged element, joined with
    newlines, and is the empty string when nothing was flagged.
    """
    problems = []
    for element in elements:
        has_t = False
        has_s = False
        # One pass over the children; both predicates are evaluated for
        # every child (no short-circuiting between children).
        for node in element.children:
            has_t |= bool(text_content(node))
            has_s |= bool(sentence(node))
        if has_s and not has_t:
            problems.append(
                '<t> and <s> mismatch: {id_}'.format(
                    id_=element.get('xml:id')))
    return not problems, '\n'.join(problems)
Exemplo n.º 2
0
def event2es(event_xml, event_order, es, index_name, type_name):
    """Store the first <event> of event_xml as an Elasticsearch document.

    Nothing is written when a document with the event's ``xml:id`` already
    exists in the given index/type.

    Parameters:
        event_xml: parsed XML object offering ``find_all('event')``; only the
            first event found is processed (presumably a BeautifulSoup
            object -- confirm against the caller).
        event_order: value stored in the document's ``order`` field.
        es: client object; ``exists``, ``indices.analyze`` and ``create``
            are called on it.
        index_name: name of the index to check and write to.
        type_name: document type to check and write to.

    The stored document holds the event id, the play id derived from it, the
    event class, the order, the sentence texts joined by single spaces and
    the number of tokens the 'standard' analyzer yields for the
    ASCII-reduced text. For events of class 'speakerturn' the actor is
    added as well.

    Returns None. Indexing ``[0]`` fails (IndexError for a list) when
    event_xml contains no event.
    """
    event = event_xml.find_all('event')[0]
    event_id = event.attrs.get('xml:id')

    # Nothing to do when the document is already present.
    if es.exists(index=index_name, doc_type=type_name, id=event_id):
        return

    play_id = xml_id2play_id(event_id)

    cls = event.attrs.get('class')
    is_speakerturn = cls == 'speakerturn'
    if is_speakerturn:
        actor = extract_character_name(event.attrs.get('actor'))

    text = []
    for elem in event.descendants:
        if sentence(elem) and not note(elem.parent):
            # Some sentences have a missing or empty <t>; skip those
            # instead of crashing on None (same guard as in act2text).
            t_elem = elem.t
            if t_elem and t_elem.string:
                text.append(t_elem.string)
    joined_text = ' '.join(text)

    num_words = 0
    text_ascii = joined_text.encode('ascii', 'ignore')
    # prevent an empty string from being sent to the analyzer
    if text_ascii and not text_ascii.isspace():
        tokens = es.indices.analyze(index=index_name,
                                    body=text_ascii,
                                    analyzer='standard').get('tokens')
        num_words = len(tokens)

    doc = {
        'event_id': event_id,
        'text_id': play_id,
        'event_class': cls,
        'order': event_order,
        'text': joined_text,
        'num_words': num_words
    }
    if is_speakerturn:
        doc['actor'] = actor

    # the existence check above guarantees the document is new
    es.create(index_name, type_name, doc)
Exemplo n.º 3
0
def act2text(act_xml):
    """Extract the sentence texts from an act.

    Prints the ``xml:id`` of the first ``div`` with class ``act`` (or None
    when there is no such div) as a progress message.

    act_xml should contain exactly one act. If it contains more acts, these
    are sub acts that will be processed later, and nothing is extracted here.

    Returns a list with the text of every sentence that is not inside a
    note and has a non-empty <t>. The list is empty when the number of acts
    found is not exactly one.
    """
    act_div = act_xml.find('div', 'act')
    act_id = act_div.attrs.get('xml:id') if act_div is not None else None
    # Single parenthesised argument: prints the same line under Python 2
    # (print statement) and Python 3 (print function).
    print('act: %s' % (act_id,))

    text = []
    subacts = act_xml.find_all(act)
    if len(subacts) != 1:
        return text

    for elem in act_xml.descendants:
        if not sentence(elem) or note(elem.parent):
            continue
        # some t elements appear to be empty (this is not allowed, but it
        # happens), so only add a string when there is one.
        t_elem = elem.t
        if t_elem and t_elem.string:
            text.append(t_elem.string)
    return text