def entities2es(event_xml, entity_class, timestamp, es, index_name, doc_type):
    """Extract entities of class ``entity_class`` from an event and store
    them in Elasticsearch.

    Entity surface forms (the ``t`` attribute of the ``wref`` children,
    lowercased and joined with spaces) are grouped per entity label and
    written with ``es.update`` to the document whose id is the event's
    ``xml:id``.

    NOTE(review): ``entity()`` and ``note()`` are helpers defined elsewhere
    in this file.
    """
    events = event_xml.find_all('event')
    event = events[0]
    event_id = event.attrs.get('xml:id')

    entities = {}
    for elem in event.descendants:
        if entity(elem) and not note(elem.parent.parent.parent):
            ent_class = '{}-'.format(entity_class)
            if elem.get('class').startswith(ent_class):
                # remove the entity class from the label name (to improve
                # readability)
                entity_name = elem.get('class').replace(ent_class, '')
                # remove "Level1:" (etc.) from tag names like
                # Level1:Lichaamswerking
                parts = entity_name.split(':')
                if len(parts) > 1:
                    entity_name = parts[1]
                if not entities.get(entity_name):
                    entities[entity_name] = []
                # get the text content of all the words that make up this
                # entity
                words = [w.get('t') for w in elem.find_all('wref')]
                entities[entity_name].append(' '.join(words).lower())

    # only add entities if there are entities to be added
    if entities:
        doc = {
            '{}-entities'.format(entity_class): {
                'data': entities,
                'timestamp': timestamp
            }
        }
        # BUG FIX: the original passed doc_type=type_name, but type_name is
        # not defined in this function -- the parameter is called doc_type,
        # so the update raised NameError whenever entities were found.
        es.update(index=index_name, doc_type=doc_type, id=event_id,
                  body={'doc': doc})
def event2es(event_xml, event_order, es, index_name, type_name):
    """Index a single event (e.g. an act or speakerturn) as an
    Elasticsearch document, unless a document with the same id exists.

    The document contains the event id, play id, event class, ordering
    number, the concatenated sentence text and a word count obtained from
    the index's standard analyzer. For speakerturns the actor name is
    added as well.

    NOTE(review): ``xml_id2play_id``, ``extract_character_name``,
    ``sentence`` and ``note`` are helpers defined elsewhere in this file.
    """
    events = event_xml.find_all('event')
    event = events[0]
    event_id = event.attrs.get('xml:id')

    if not es.exists(index=index_name, doc_type=type_name, id=event_id):
        play_id = xml_id2play_id(event_id)
        cls = event.attrs.get('class')

        actor = None
        if cls == 'speakerturn':
            actor = extract_character_name(event.attrs.get('actor'))

        text = []
        for elem in event.descendants:
            if sentence(elem) and not note(elem.parent):
                text.append(elem.t.string)

        num_words = 0
        text_ascii = ' '.join(text).encode('ascii', 'ignore')
        # prevent empty string to be send to the analyzer
        if text_ascii and not text_ascii.isspace():
            ws = es.indices.analyze(index=index_name, body=text_ascii,
                                    analyzer='standard').get('tokens')
            num_words = len(ws)

        doc = {
            'event_id': event_id,
            'text_id': play_id,
            'event_class': cls,
            'order': event_order,
            'text': ' '.join(text),
            'num_words': num_words
        }
        if cls == 'speakerturn':
            doc['actor'] = actor

        # create document if it does not yet exist
        # BUG FIX: the original called es.create(index_name, type_name, doc)
        # without an id, although the existence check above is keyed on
        # event_id; pass the arguments explicitly so the document is
        # created under event_id and the dedup check is meaningful.
        es.create(index=index_name, doc_type=type_name, id=event_id,
                  body=doc)
def act2text(act_xml): """Extract text from act. Returns a string that can be written to file. """ text = [] print 'act:', act_xml.find('div', 'act').attrs.get('xml:id') subacts = act_xml.find_all(act) # act_xml should contain exactly one act; if it contains more acts, these # acts are sub acts, that will be processed later if len(subacts) == 1: for elem in act_xml.descendants: if sentence(elem) and not note(elem.parent): # some t elements appear to be empty (this is not allowed, but # it happens). So, check whether there is a string to add # before adding it. if elem.t: if elem.t.string: text.append(elem.t.string) return text
# NOTE(review): this chunk is the interior of a larger script -- the
# enclosing loops (over input files ``f`` and over LIWC categories ``cat``)
# and the definitions of ``liwc_count``, ``liwc_dict``, ``liwc_categories``,
# ``num_words``, ``result``, ``act_tag``, ``word``, ``sentence``, ``note``
# and ``args`` start before this view. Indentation below is reconstructed
# from the mangled source -- confirm nesting against the full file.
liwc_count[cat] = 0
# presumably the text id is encoded in a fixed position of the file name --
# TODO confirm against the corpus naming scheme
text_id = f[-20:-7]
fi = open(f)

# stream-parse the (potentially huge) XML file, firing once per closing
# act-level tag so only one div is in memory at a time
context = etree.iterparse(fi, events=('end',), tag=act_tag, huge_tree=True)

for event, elem in context:
    #print elem.attrib
    if elem.get('class') == 'act':
        # load div into memory
        div_xml = BeautifulSoup(etree.tostring(elem), 'xml')
        sentences = div_xml.find_all(sentence)
        s = None
        for sent in sentences:
            # skip sentences that are part of notes
            if not note(sent.parent):
                sent_id = sent.attrs.get('xml:id')
                sent_words = [w.t.string.lower()
                              for w in sent.find_all(word)]
                for w in sent_words:
                    if w not in string.punctuation:
                        num_words += 1
                        # tally every LIWC category this word belongs to
                        if w in liwc_dict.keys():
                            #print w
                            for cat in liwc_dict[w]:
                                liwc_count[liwc_categories[cat]] += 1

# store the per-text category counts and total word count
result.loc[text_id] = pd.Series(liwc_count)
result.set_value(text_id, '#words', num_words)

print result
result.to_csv(args.out_file)