def tokenize(self):
    """Re-tokenize this sentence with nltk while preserving timing info.

    Joins the current tokens' words, runs the string through
    ``nltk.word_tokenize``, and rebuilds ``self.tokens``:

    * tokens nltk leaves intact keep their original ``AudioToken``;
    * tokens nltk splits (the joined raw tokens reassemble the original
      word) are replaced by fresh ``AudioToken``s, with the original
      token's duration distributed evenly over the pieces.

    Finally a ``PunctuationToken`` for "." is appended and
    ``self.tokens`` is rebound to the new list.

    NOTE(review): the caller (``parse``) only invokes this when the
    sentence has at least one token; an empty sentence would raise
    ``NameError`` at the trailing sanity checks.
    """
    # NOTE(review): a pure length comparison is used to decide whether
    # nltk kept a token intact -- assumes nltk never rewrites characters
    # without changing the length. TODO confirm against the corpus.
    sentence_str = ' '.join(t.word for t in self.tokens)
    raw_tokens = nltk.word_tokenize(sentence_str)
    final_tokens = []
    raw_idx = 0  # cursor into raw_tokens
    for idx in range(len(self.tokens)):
        token = self.tokens[idx]
        if len(token.word) == len(raw_tokens[raw_idx]):
            # nltk kept this token whole -- reuse it unchanged.
            final_tokens.append(token)
            raw_idx += 1
        else:
            # nltk split this token: grow the window until the
            # concatenated raw tokens reproduce the original word.
            # (Debug print removed -- it leaked internal state to stdout
            # on every widening step.)
            current_index = raw_idx + 2
            while ''.join(raw_tokens[raw_idx:current_index]) != token.word:
                current_index += 1
            nr_tokens = current_index - raw_idx
            # Spread the original duration evenly over the pieces.
            current_begin = token.begin
            duration_step = token.duration / nr_tokens
            for i in range(raw_idx, current_index):
                new_token = AudioToken(raw_tokens[i])
                new_token.duration = duration_step
                new_token.begin = current_begin
                current_begin += duration_step
                final_tokens.append(new_token)
            raw_idx += nr_tokens
    # Sanity checks: every original token and raw token was consumed.
    assert idx == len(self.tokens) - 1
    assert raw_idx == len(raw_tokens)
    # Terminate the sentence with an explicit period token.
    final_tokens.append(PunctuationToken(".", Punctuation.PERIOD))
    self.tokens = final_tokens
def tokenize(self):
    """Align this sentence's timed tokens with nltk's tokenization.

    Words that nltk leaves whole keep their original ``AudioToken``;
    words that nltk splits are replaced by new ``AudioToken``s sharing
    the original duration in equal parts.  A terminating "."
    ``PunctuationToken`` is appended and ``self.tokens`` is rebound.
    """
    joined = ' '.join(t.word for t in self.tokens)
    raw_tokens = nltk.word_tokenize(joined)
    aligned = []
    cursor = 0  # position in raw_tokens
    for last_idx, orig in enumerate(self.tokens):
        if len(orig.word) == len(raw_tokens[cursor]):
            # Unchanged by nltk: keep the original timed token.
            aligned.append(orig)
            cursor += 1
            continue
        # Split by nltk: widen the window until the pieces reassemble
        # the original word.
        end = cursor + 2
        while ''.join(raw_tokens[cursor:end]) != orig.word:
            print(raw_tokens[cursor:end], orig.word)
            end += 1
        pieces = end - cursor
        step = orig.duration / pieces
        begin = orig.begin
        for piece in raw_tokens[cursor:end]:
            part = AudioToken(piece)
            part.duration = step
            part.begin = begin
            begin += step
            aligned.append(part)
        cursor = end
    # Sanity checks: both token streams fully consumed.
    assert last_idx == len(self.tokens) - 1
    assert cursor == len(raw_tokens)
    aligned.append(PunctuationToken(".", Punctuation.PERIOD))
    self.tokens = aligned
def parse(self):
    # Generator: stream the timing file and yield one prepared Audio per talk.
    #
    # File format (as read by the code below): lines starting with "#" are
    # headers carrying a talk id; every other line is a whitespace-separated
    # record whose column 2 is the token begin time, column 3 its duration,
    # and column 4 the word.
    #
    # NOTE(review): relies on the Python 2 ``unicode`` builtin -- this file
    # is Python 2 only.
    current_talk_id = 0
    audio = Audio()
    sentence = AudioSentence()
    sentence.tokens = []
    group_name = self._extract_group_name()
    with open(self.filename, "r") as file_:
        for line_unenc in file_:
            self._progress += 1
            # parse line; undecodable bytes are silently dropped
            line = unicode(line_unenc, errors="ignore")
            line = line.rstrip()
            if line.startswith("#"):
                talk_id = self._extract_talk_id(line)
                token_count = len(sentence.tokens)
                # end of sentence reached: finalize timing, re-tokenize,
                # and attach the sentence to the current talk's audio
                if token_count > 0:
                    sentence.begin = sentence.tokens[0].begin
                    sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
                    sentence.tokenize()
                    sentence.prepare()
                    audio.add_sentence(sentence)
                # end of talk reached
                if talk_id != current_talk_id:
                    if token_count > 0:
                        # save audio talk and start a fresh one
                        audio.talk_id = current_talk_id
                        audio.group_name = group_name
                        audio = self._prepare_audio(audio)
                        yield audio
                        audio = Audio()
                        current_talk_id = talk_id
                        # NOTE(review): this continue skips the
                        # sentence reset below, so the flushed sentence
                        # object is reused for the next talk -- looks
                        # unintended; confirm against the data format.
                        continue
                    else:
                        # new talk seen before any tokens (e.g. first header)
                        current_talk_id = talk_id
                # begin a new sentence
                sentence = AudioSentence()
                sentence.tokens = []
            else:
                # parse a token line: columns 2-4 are begin/duration/word
                line_parts = re.split(" +", line)
                begin = float(line_parts[2])
                duration = float(line_parts[3])
                word = line_parts[4]
                # add token to sentence (words are normalized to lower case)
                token = AudioToken(word.lower())
                token.begin = begin
                token.duration = duration
                sentence.append_token(token)
    # EOF: flush the last pending sentence, then the last pending talk
    if len(sentence.tokens) > 0:
        sentence.begin = sentence.tokens[0].begin
        sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
        sentence.tokenize()
        sentence.prepare()
        audio.add_sentence(sentence)
    if len(audio.sentences) > 0:
        audio.talk_id = current_talk_id
        audio.group_name = group_name
        audio = self._prepare_audio(audio)
        yield audio
def parse(self):
    # Generator: stream the timing file and yield one prepared Audio per talk.
    #
    # File format (as read by the code below): lines starting with "#" are
    # headers carrying a talk id; every other line is a whitespace-separated
    # record whose column 2 is the token begin time, column 3 its duration,
    # and column 4 the word.
    #
    # NOTE(review): relies on the Python 2 ``unicode`` builtin -- this file
    # is Python 2 only.
    current_talk_id = 0
    audio = Audio()
    sentence = AudioSentence()
    sentence.tokens = []
    group_name = self._extract_group_name()
    with open(self.filename, "r") as file_:
        for line_unenc in file_:
            self._progress += 1
            # parse line; undecodable bytes are silently dropped
            line = unicode(line_unenc, errors='ignore')
            line = line.rstrip()
            if line.startswith("#"):
                talk_id = self._extract_talk_id(line)
                token_count = len(sentence.tokens)
                # end of sentence reached: finalize timing, re-tokenize,
                # and attach the sentence to the current talk's audio
                if token_count > 0:
                    sentence.begin = sentence.tokens[0].begin
                    sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
                    sentence.tokenize()
                    sentence.prepare()
                    audio.add_sentence(sentence)
                # end of talk reached
                if talk_id != current_talk_id:
                    if token_count > 0:
                        # save audio talk and start a fresh one
                        audio.talk_id = current_talk_id
                        audio.group_name = group_name
                        audio = self._prepare_audio(audio)
                        yield audio
                        audio = Audio()
                        current_talk_id = talk_id
                        # NOTE(review): this continue skips the
                        # sentence reset below, so the flushed sentence
                        # object is reused for the next talk -- looks
                        # unintended; confirm against the data format.
                        continue
                    else:
                        # new talk seen before any tokens (e.g. first header)
                        current_talk_id = talk_id
                # begin a new sentence
                sentence = AudioSentence()
                sentence.tokens = []
            else:
                # parse a token line: columns 2-4 are begin/duration/word
                line_parts = re.split(" +", line)
                begin = float(line_parts[2])
                duration = float(line_parts[3])
                word = line_parts[4]
                # add token to sentence (words are normalized to lower case)
                token = AudioToken(word.lower())
                token.begin = begin
                token.duration = duration
                sentence.append_token(token)
    # EOF: flush the last pending sentence, then the last pending talk
    if (len(sentence.tokens) > 0):
        sentence.begin = sentence.tokens[0].begin
        sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
        sentence.tokenize()
        sentence.prepare()
        audio.add_sentence(sentence)
    if len(audio.sentences) > 0:
        audio.talk_id = current_talk_id
        audio.group_name = group_name
        audio = self._prepare_audio(audio)
        yield audio