def tokenize(self):
        """Re-tokenize the sentence with NLTK and terminate it with a period.

        The words of ``self.tokens`` are joined with single spaces and passed
        to ``nltk.word_tokenize``. A token whose word has the same length as
        the next NLTK token is kept unchanged. Any other token is treated as
        having been split by NLTK: it is replaced by one ``AudioToken`` per
        piece, and the original token's ``duration`` is divided equally among
        the pieces, the first piece starting at the original ``begin``.

        Finally a period ``PunctuationToken`` is appended and ``self.tokens``
        is replaced by the new list. An empty sentence therefore ends up
        containing only the period token.

        Raises:
            ValueError: if the NLTK pieces following a token never
                concatenate back to that token's word (for example when the
                tokenizer altered characters instead of only splitting).
                Previously this case looped forever.
        """
        sentence_str = ' '.join(t.word for t in self.tokens)

        raw_tokens = nltk.word_tokenize(sentence_str)

        final_tokens = []

        raw_idx = 0
        for token in self.tokens:
            if len(token.word) == len(raw_tokens[raw_idx]):
                # NLTK kept this word in one piece; reuse the original token.
                final_tokens.append(token)
                raw_idx += 1
                continue

            # NLTK split this word into several pieces. Widen the window
            # raw_tokens[raw_idx:current_index] until the pieces concatenate
            # back to the original word.
            current_index = raw_idx + 2
            while ''.join(raw_tokens[raw_idx:current_index]) != token.word:
                if current_index >= len(raw_tokens):
                    # The window already reaches the end of raw_tokens, so
                    # growing it further can never produce a match.
                    raise ValueError(
                        "cannot align token %r with NLTK tokens %r"
                        % (token.word, raw_tokens[raw_idx:]))
                current_index += 1

            nr_tokens = current_index - raw_idx

            # Spread the original token's time span evenly over its pieces.
            current_begin = token.begin
            duration_step = token.duration / nr_tokens

            for piece in raw_tokens[raw_idx:current_index]:
                new_token = AudioToken(piece)
                new_token.duration = duration_step
                new_token.begin = current_begin

                current_begin += duration_step

                final_tokens.append(new_token)

            raw_idx += nr_tokens

        # Sanity check: every NLTK token must have been consumed.
        assert raw_idx == len(raw_tokens)

        final_tokens.append(PunctuationToken(".", Punctuation.PERIOD))
        self.tokens = final_tokens
    def tokenize(self):
        """Align the sentence's tokens with NLTK's tokenization.

        ``self.tokens`` is rebuilt as follows: the token words are joined
        with spaces and run through ``nltk.word_tokenize``. Whenever the next
        NLTK token has the same length as the current word, the original
        token is kept. Otherwise the word is treated as split into several
        NLTK tokens; each piece becomes a new ``AudioToken`` and the original
        ``duration`` is shared equally between the pieces, beginning at the
        original ``begin``. A period ``PunctuationToken`` is appended at the
        end. For an empty sentence the result is just that period token.

        Raises:
            ValueError: if no run of consecutive NLTK tokens starting at the
                current position concatenates to the word being split. The
                search used to continue without bound in that situation.
        """
        sentence_str = ' '.join(token.word for token in self.tokens)
        raw_tokens = nltk.word_tokenize(sentence_str)
        raw_count = len(raw_tokens)

        final_tokens = []
        raw_idx = 0

        for token in self.tokens:
            word = token.word

            if len(word) == len(raw_tokens[raw_idx]):
                # Unsplit word: keep the original token as it is.
                final_tokens.append(token)
                raw_idx += 1
            else:
                # We need to split up our token: find the end of the run of
                # NLTK tokens that together spell the original word.
                current_index = raw_idx + 2
                while ''.join(raw_tokens[raw_idx:current_index]) != word:
                    if current_index >= raw_count:
                        # No more NLTK tokens to add to the window, so the
                        # word can never be matched; fail instead of
                        # spinning forever.
                        raise ValueError(
                            "cannot align token %r with NLTK tokens %r"
                            % (word, raw_tokens[raw_idx:]))
                    current_index += 1

                nr_tokens = current_index - raw_idx

                # Each piece gets an equal share of the original duration.
                duration_step = token.duration / nr_tokens
                current_begin = token.begin

                for i in range(raw_idx, current_index):
                    new_token = AudioToken(raw_tokens[i])
                    new_token.duration = duration_step
                    new_token.begin = current_begin
                    final_tokens.append(new_token)

                    current_begin += duration_step

                raw_idx = current_index

        # Sanity check: all NLTK tokens must have been matched to a word.
        assert raw_idx == raw_count

        final_tokens.append(PunctuationToken(".", Punctuation.PERIOD))
        self.tokens = final_tokens
    def parse(self):
        """Parse ``self.filename`` and yield one prepared audio per talk.

        Lines starting with ``#`` are header lines: they close the sentence
        collected so far and carry a talk id (read through
        ``self._extract_talk_id``). Every other line is split on runs of
        spaces and describes one token: field 2 is its begin, field 3 its
        duration and field 4 the word, which is lower-cased.

        When the talk id of a header differs from the current one, the audio
        collected so far is emitted and a fresh ``Audio`` is started. After
        the last line, a pending sentence and a pending audio are flushed as
        well. ``self._progress`` is incremented once per line read.

        Yields:
            The return value of ``self._prepare_audio(audio)`` for every talk
            that has at least one sentence, with ``talk_id`` and
            ``group_name`` set beforehand.
        """
        def finish_sentence(finished, target_audio):
            # Set the sentence's time span from its first and last token,
            # then tokenize/prepare it and attach it to the audio.
            finished.begin = finished.tokens[0].begin
            finished.end = finished.tokens[-1].begin + finished.tokens[-1].duration
            finished.tokenize()
            finished.prepare()
            target_audio.add_sentence(finished)

        current_talk_id = 0
        audio = Audio()
        sentence = AudioSentence()
        sentence.tokens = []

        group_name = self._extract_group_name()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1

                # decode the line, dropping bytes that cannot be decoded
                line = unicode(line_unenc, errors="ignore")
                line = line.rstrip()

                if line.startswith("#"):
                    talk_id = self._extract_talk_id(line)

                    # end of sentence reached
                    if len(sentence.tokens) > 0:
                        finish_sentence(sentence, audio)

                    # end of talk reached
                    if talk_id != current_talk_id:
                        # Flush on the same condition as the end-of-file
                        # flush below, so sentences of the finished talk are
                        # never carried over into the next talk's audio.
                        if len(audio.sentences) > 0:
                            # save audio talk
                            audio.talk_id = current_talk_id
                            audio.group_name = group_name
                            audio = self._prepare_audio(audio)
                            yield audio
                            audio = Audio()
                        current_talk_id = talk_id

                    # Always begin a new sentence after a header. The old
                    # code skipped this on a talk change, so the finished
                    # sentence kept collecting the next talk's tokens.
                    sentence = AudioSentence()
                    sentence.tokens = []

                else:
                    # parse token line
                    line_parts = re.split(" +", line)
                    begin = float(line_parts[2])
                    duration = float(line_parts[3])
                    word = line_parts[4]

                    # add token to sentence
                    token = AudioToken(word.lower())
                    token.begin = begin
                    token.duration = duration

                    sentence.append_token(token)

        # flush the sentence that was still open at end of file
        if len(sentence.tokens) > 0:
            finish_sentence(sentence, audio)

        # flush the last talk
        if len(audio.sentences) > 0:
            audio.talk_id = current_talk_id
            audio.group_name = group_name
            audio = self._prepare_audio(audio)
            yield audio
Exemplo n.º 4
0
    def parse(self):
        """Read ``self.filename`` line by line and yield the audio of each talk.

        A line beginning with ``#`` is a header: it ends the current sentence
        and provides the talk id via ``self._extract_talk_id``. Any other
        line is split on runs of spaces into fields, of which field 2 is
        read as the token's begin, field 3 as its duration and field 4 as
        its word (stored lower-cased).

        A header whose talk id differs from the current one closes the
        current talk: its audio is emitted and a new ``Audio`` is started.
        At end of file the open sentence and the open audio are flushed.
        ``self._progress`` is incremented for every line read.

        Yields:
            ``self._prepare_audio(audio)`` for each talk containing at least
            one sentence; ``talk_id`` and ``group_name`` are assigned on the
            audio before it is prepared.
        """
        group_name = self._extract_group_name()

        current_talk_id = 0
        audio = Audio()
        sentence = AudioSentence()
        sentence.tokens = []

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1

                # decode the line, ignoring undecodable bytes
                line = unicode(line_unenc, errors='ignore').rstrip()

                if not line.startswith("#"):
                    # token line: fields 2, 3 and 4 hold begin, duration, word
                    line_parts = re.split(" +", line)

                    token = AudioToken(line_parts[4].lower())
                    token.begin = float(line_parts[2])
                    token.duration = float(line_parts[3])

                    sentence.append_token(token)
                    continue

                # header line
                talk_id = self._extract_talk_id(line)

                # end of sentence reached
                if len(sentence.tokens) > 0:
                    last_token = sentence.tokens[-1]
                    sentence.begin = sentence.tokens[0].begin
                    sentence.end = last_token.begin + last_token.duration
                    sentence.tokenize()
                    sentence.prepare()
                    audio.add_sentence(sentence)

                # end of talk reached
                if talk_id != current_talk_id:
                    # Emit the finished talk whenever it has sentences (the
                    # same test as the end-of-file flush), so they cannot
                    # end up in the audio of the following talk.
                    if len(audio.sentences) > 0:
                        audio.talk_id = current_talk_id
                        audio.group_name = group_name
                        audio = self._prepare_audio(audio)
                        yield audio
                        audio = Audio()
                    current_talk_id = talk_id

                # Begin a new sentence after every header. Previously a talk
                # change skipped this step, so the already finished sentence
                # went on collecting tokens of the next talk.
                sentence = AudioSentence()
                sentence.tokens = []

        # flush the sentence that was still open at end of file
        if len(sentence.tokens) > 0:
            last_token = sentence.tokens[-1]
            sentence.begin = sentence.tokens[0].begin
            sentence.end = last_token.begin + last_token.duration
            sentence.tokenize()
            sentence.prepare()
            audio.add_sentence(sentence)

        # flush the last talk
        if len(audio.sentences) > 0:
            audio.talk_id = current_talk_id
            audio.group_name = group_name
            audio = self._prepare_audio(audio)
            yield audio