예제 #1
0
def entities2es(event_xml, entity_class, timestamp, es, index_name, doc_type):
    """Add the entities of one entity class found in an event to its ES doc.

    The first ``event`` element of ``event_xml`` is scanned for entity
    elements whose ``class`` attribute starts with ``<entity_class>-``. For
    every such element the lower-cased, space-joined ``t`` attributes of its
    ``wref`` children are collected per label. If at least one entity was
    found, the document with the event's ``xml:id`` is partially updated with
    a ``<entity_class>-entities`` field holding the collected data and the
    given timestamp.

    Args:
        event_xml: parsed XML offering ``find_all``; must contain at least
            one ``event`` element.
        entity_class: prefix of the entity labels to extract.
        timestamp: value stored next to the extracted entities.
        es: Elasticsearch client; only ``update`` is called on it.
        index_name: name of the index that holds the event document.
        doc_type: document type of the event document.

    Raises:
        IndexError: if ``event_xml`` contains no ``event`` element.
    """
    events = event_xml.find_all('event')
    event = events[0]
    event_id = event.attrs.get('xml:id')

    # label prefix that marks an entity as belonging to entity_class; it does
    # not change per element, so build it once
    ent_class = '{}-'.format(entity_class)

    entities = {}
    for elem in event.descendants:
        # skip everything that is not an entity, and entities inside notes
        if not entity(elem) or note(elem.parent.parent.parent):
            continue

        label = elem.get('class')
        if not label.startswith(ent_class):
            continue

        # remove the entity class from the label name (to improve
        # readability)
        entity_name = label.replace(ent_class, '')

        # remove "Level1:" (etc.) from tag names like
        # Level1:Lichaamswerking
        parts = entity_name.split(':')
        if len(parts) > 1:
            entity_name = parts[1]

        # get the text content of all the words that make up this entity
        words = [w.get('t') for w in elem.find_all('wref')]
        entities.setdefault(entity_name, []).append(' '.join(words).lower())

    # only add entities if there are entities to be added
    if entities:
        doc = {
            '{}-entities'.format(entity_class): {
                'data': entities,
                'timestamp': timestamp
            }
        }

        # the original passed the undefined name ``type_name`` here, which
        # raised NameError as soon as an entity was found
        es.update(index=index_name,
                  doc_type=doc_type,
                  id=event_id,
                  body={'doc': doc})
예제 #2
0
def event2es(event_xml, event_order, es, index_name, type_name):
    """Index the first event of ``event_xml`` in Elasticsearch, if missing.

    Nothing happens when a document with the event's ``xml:id`` already
    exists. Otherwise the text of all sentences outside notes is collected,
    the number of words is determined with the ``standard`` analyzer of the
    index, and a new document is created under the event's ``xml:id``.

    Args:
        event_xml: parsed XML offering ``find_all``; must contain at least
            one ``event`` element.
        event_order: value stored in the ``order`` field of the document.
        es: Elasticsearch client; ``exists``, ``indices.analyze`` and
            ``create`` are called on it.
        index_name: name of the index to write to.
        type_name: document type of the new document.

    Raises:
        IndexError: if ``event_xml`` contains no ``event`` element.
    """
    events = event_xml.find_all('event')
    event = events[0]
    event_id = event.attrs.get('xml:id')

    # create document only if it does not yet exist
    if es.exists(index=index_name, doc_type=type_name, id=event_id):
        return

    play_id = xml_id2play_id(event_id)
    cls = event.attrs.get('class')

    text = []
    for elem in event.descendants:
        if sentence(elem) and not note(elem.parent):
            # some t elements are missing or empty; skip those instead of
            # failing on them (the same check act2text performs)
            t_elem = elem.t
            if t_elem and t_elem.string:
                text.append(t_elem.string)
    full_text = ' '.join(text)

    num_words = 0
    text_ascii = full_text.encode('ascii', 'ignore')
    # prevent empty string to be send to the analyzer
    if text_ascii and not text_ascii.isspace():
        ws = es.indices.analyze(index=index_name,
                                body=text_ascii,
                                analyzer='standard').get('tokens')
        num_words = len(ws)

    doc = {
        'event_id': event_id,
        'text_id': play_id,
        'event_class': cls,
        'order': event_order,
        'text': full_text,
        'num_words': num_words
    }
    if cls == 'speakerturn':
        doc['actor'] = extract_character_name(event.attrs.get('actor'))

    # Store the document under the event id: the existence check above and
    # later updates look the document up by this id. Keyword arguments keep
    # the call independent of the client's positional parameter order.
    es.create(index=index_name, doc_type=type_name, body=doc, id=event_id)
예제 #3
0
def act2text(act_xml):
    """Extract text from act.
    Returns a string that can be written to file.
    """
    text = []

    print 'act:', act_xml.find('div', 'act').attrs.get('xml:id')

    subacts = act_xml.find_all(act)

    # act_xml should contain exactly one act; if it contains more acts, these
    # acts are sub acts, that will be processed later
    if len(subacts) == 1:
        for elem in act_xml.descendants:
            if sentence(elem) and not note(elem.parent):
                # some t elements appear to be empty (this is not allowed, but
                # it happens). So, check whether there is a string to add
                # before adding it.
                if elem.t:
                    if elem.t.string:
                        text.append(elem.t.string)
    return text
예제 #4
0
            liwc_count[cat] = 0

        text_id = f[-20:-7]

        fi = open(f)

        context = etree.iterparse(fi, events=('end',), tag=act_tag, huge_tree=True)
        for event, elem in context:
            #print elem.attrib
            if elem.get('class') == 'act':
                # load div into memory
                div_xml = BeautifulSoup(etree.tostring(elem), 'xml')
                sentences = div_xml.find_all(sentence)
                s = None
                for sent in sentences:
                    if not note(sent.parent):
                        sent_id = sent.attrs.get('xml:id')

                        sent_words = [w.t.string.lower()
                                      for w in sent.find_all(word)]
                        for w in sent_words:
                            if w not in string.punctuation:
                                num_words += 1
                            if w in liwc_dict.keys():
                                #print w
                                for cat in liwc_dict[w]:
                                    liwc_count[liwc_categories[cat]] += 1
        result.loc[text_id] = pd.Series(liwc_count)
        result.set_value(text_id, '#words', num_words)
    print result
    result.to_csv(args.out_file)