예제 #1
0
파일: text.py 프로젝트: DrDub/icsisumm
 def splitta(self, text):
     """Run the sentence-boundary detector over ``text``.

     Forwards ``text`` and the model stored on ``self._splitta_model`` to
     ``sbd.sbd_text`` with ``do_tok=False`` and returns whatever that call
     returns, unchanged.
     """
     split_sentences = sbd.sbd_text
     return split_sentences(self._splitta_model, text, do_tok=False)
예제 #2
0
    # Walk the interview document and print Redis commands (RPUSH / HMSET)
    # that index its speech blocks and sentences.
    # NOTE(review): interview_id, speaker_ids, sbd_model, doc and
    # next_sentence_number are defined outside this fragment.

    # Running 1-based counter used to build speech block ids.
    next_speechblock_number = 1

    # State carried across loop iterations: the most recent <sp> seen and
    # the speaker attached to it.
    speechblock_id = None
    speaker_id = None
    # One-shot hook invoked with the id of the next sentence emitted.
    # NOTE(review): nothing in the visible code assigns a non-None value;
    # presumably a 'milestone' branch that is cut off from this view does --
    # confirm against the full source.
    next_sentence_callback = None

    # Select speech blocks, their paragraphs and milestones in one query.
    for e in doc.xpath('//sp | //sp/p | //milestone'):
        if e.tag == 'sp':
            # The speech block id reuses the part of interview_id after its
            # first ':' (raises IndexError if interview_id contains no ':').
            speechblock_id = 'speechblocks:{0}/{1}'.format(interview_id.split(':',1)[1],next_speechblock_number)
            next_speechblock_number += 1
            # The speaker is looked up by the element's 'who' attribute.
            speaker_id = speaker_ids[e.get('who')]
            # Register the speech block under both the interview and the speaker.
            print 'RPUSH "{0}:speechblocks" "{1}"'.format(interview_id,speechblock_id)
            print 'RPUSH "{0}:speechblocks" "{1}"'.format(speaker_id,speechblock_id)
        elif e.tag == 'p':
            # milestones do not occur within a <p>
            # Join every text node of the paragraph with spaces, then split
            # the result into sentences.
            sentences = sbd.sbd_text(sbd_model,' '.join(e.itertext()),False)
            # sentence_index restarts at 0 for each paragraph.
            for sentence_index, sentence_text in enumerate(sentences):
                # Milestone expects to be called at the very next sentence.
                sentence_id = 'sentences:{0}'.format(next_sentence_number)
                next_sentence_number += 1
                # Register the sentence under the interview, the current
                # speech block and the current speaker.
                print 'RPUSH "{0}:sentences" "{1}"'.format(interview_id,sentence_id)
                print 'RPUSH "{0}:sentences" "{1}"'.format(speechblock_id,sentence_id)
                print 'RPUSH "{0}:sentences" "{1}"'.format(speaker_id,sentence_id)
                # N.B. text encoding is UTF-8
                # Double quotes must be escaped for Redis syntax
                # NOTE(review): only '"' is escaped here; backslashes in the
                # text are passed through as-is -- confirm that is acceptable.
                sentence_text = sentence_text.replace('"','\\"').encode('utf-8')
                # Store the sentence record as a hash keyed by sentence_id.
                print 'HMSET "{0}" "text" "{1}" "index" "{2}" "speechblock" "{3}" "speaker" "{4}" "interview" "{5}"'.format(sentence_id, sentence_text, sentence_index, speechblock_id, speaker_id, interview_id)
                # Fire the pending callback once, then clear it so later
                # sentences do not trigger it again.
                if next_sentence_callback is not None:
                    next_sentence_callback(sentence_id)
                    next_sentence_callback = None
예제 #3
0
 def splitta(self, text):
     """Return the result of ``sbd.sbd_text`` for ``text``.

     The call uses the model held in ``self._splitta_model`` and passes
     ``do_tok=False``; its return value is handed back as-is.
     """
     detector = sbd.sbd_text
     result = detector(self._splitta_model, text, do_tok=False)
     return result