def splitta(self, text):
    """Run splitta's ``sbd.sbd_text`` over *text* with this object's model.

    The call is made with ``self._splitta_model`` and ``do_tok=False``; the
    value returned by ``sbd.sbd_text`` is passed back unchanged (presumably
    the sentences found in *text* -- confirm against the ``sbd`` module).
    """
    model = self._splitta_model
    return sbd.sbd_text(model, text, do_tok=False)
# Walk the interview document and print one Redis command per line that
# registers every speech block and every sentence.
#
# Names read from the surrounding scope: doc, interview_id, speaker_ids, sbd,
# sbd_model and next_sentence_number (a running counter advanced here).
next_speechblock_number = 1
speechblock_id = None
speaker_id = None
# One-shot hook: when set, it is called with the id of the next sentence that
# is emitted and then cleared.  Nothing in this fragment assigns it a
# non-None value, and no branch for the selected <milestone> elements is
# visible here -- presumably that handling lives in code outside this view.
next_sentence_callback = None

for e in doc.xpath('//sp | //sp/p | //milestone'):
    if e.tag == 'sp':
        # New speech block.  Its id is "speechblocks:<x>/<n>", where <x> is
        # the part of interview_id after the first ':' and <n> counts from 1.
        speechblock_id = 'speechblocks:{0}/{1}'.format(
            interview_id.split(':', 1)[1], next_speechblock_number)
        next_speechblock_number += 1
        speaker_id = speaker_ids[e.get('who')]
        # The block is listed under both its interview and its speaker.
        print('RPUSH "{0}:speechblocks" "{1}"'.format(interview_id, speechblock_id))
        print('RPUSH "{0}:speechblocks" "{1}"'.format(speaker_id, speechblock_id))
    elif e.tag == 'p':
        # milestones do not occur within a <p>
        sentences = sbd.sbd_text(sbd_model, ' '.join(e.itertext()), do_tok=False)
        for sentence_index, sentence_text in enumerate(sentences):
            sentence_id = 'sentences:{0}'.format(next_sentence_number)
            next_sentence_number += 1
            # Each sentence is listed under its interview, speech block and
            # speaker.
            print('RPUSH "{0}:sentences" "{1}"'.format(interview_id, sentence_id))
            print('RPUSH "{0}:sentences" "{1}"'.format(speechblock_id, sentence_id))
            print('RPUSH "{0}:sentences" "{1}"'.format(speaker_id, sentence_id))
            # N.B. text encoding is UTF-8
            # The text goes inside a double-quoted Redis argument, where a
            # backslash introduces an escape sequence.  Backslashes must
            # therefore be doubled first (otherwise a literal "\" in the
            # transcript would be misread), and only then are the double
            # quotes escaped.
            sentence_text = (sentence_text
                             .replace('\\', '\\\\')
                             .replace('"', '\\"')
                             .encode('utf-8'))
            print('HMSET "{0}" "text" "{1}" "index" "{2}" "speechblock" "{3}" "speaker" "{4}" "interview" "{5}"'.format(
                sentence_id, sentence_text, sentence_index,
                speechblock_id, speaker_id, interview_id))
            # Milestone expects to be called at the very next sentence.
            if next_sentence_callback is not None:
                next_sentence_callback(sentence_id)
                next_sentence_callback = None