def tokenize(self):
    """Re-tokenize this sentence with nltk while preserving timing info.

    Joins the current tokens' words, runs the string through
    ``nltk.word_tokenize``, and rebuilds ``self.tokens``:

    * tokens nltk leaves intact keep their original ``AudioToken``;
    * tokens nltk splits (the joined raw tokens reassemble the original
      word) are replaced by fresh ``AudioToken``s, with the original
      token's duration distributed evenly over the pieces.

    Finally a ``PunctuationToken`` for "." is appended and
    ``self.tokens`` is rebound to the new list.

    NOTE(review): the caller (``parse``) only invokes this when the
    sentence has at least one token; an empty sentence would raise
    ``NameError`` at the trailing sanity checks.
    """
    # NOTE(review): a pure length comparison is used to decide whether
    # nltk kept a token intact -- assumes nltk never rewrites characters
    # without changing the length. TODO confirm against the corpus.
    sentence_str = ' '.join(t.word for t in self.tokens)
    raw_tokens = nltk.word_tokenize(sentence_str)
    final_tokens = []
    raw_idx = 0  # cursor into raw_tokens
    for idx in range(len(self.tokens)):
        token = self.tokens[idx]
        if len(token.word) == len(raw_tokens[raw_idx]):
            # nltk kept this token whole -- reuse it unchanged.
            final_tokens.append(token)
            raw_idx += 1
        else:
            # nltk split this token: grow the window until the
            # concatenated raw tokens reproduce the original word.
            # (Debug print removed -- it leaked internal state to stdout
            # on every widening step.)
            current_index = raw_idx + 2
            while ''.join(raw_tokens[raw_idx:current_index]) != token.word:
                current_index += 1
            nr_tokens = current_index - raw_idx
            # Spread the original duration evenly over the pieces.
            current_begin = token.begin
            duration_step = token.duration / nr_tokens
            for i in range(raw_idx, current_index):
                new_token = AudioToken(raw_tokens[i])
                new_token.duration = duration_step
                new_token.begin = current_begin
                current_begin += duration_step
                final_tokens.append(new_token)
            raw_idx += nr_tokens
    # Sanity checks: every original token and raw token was consumed.
    assert idx == len(self.tokens) - 1
    assert raw_idx == len(raw_tokens)
    # Terminate the sentence with an explicit period token.
    final_tokens.append(PunctuationToken(".", Punctuation.PERIOD))
    self.tokens = final_tokens
def tokenize(self):
    """Align this sentence's timed tokens with nltk's tokenization.

    Words that nltk leaves whole keep their original ``AudioToken``;
    words that nltk splits are replaced by new ``AudioToken``s sharing
    the original duration in equal parts.  A terminating "."
    ``PunctuationToken`` is appended and ``self.tokens`` is rebound.
    """
    joined = ' '.join(t.word for t in self.tokens)
    raw_tokens = nltk.word_tokenize(joined)
    aligned = []
    cursor = 0  # position in raw_tokens
    for last_idx, orig in enumerate(self.tokens):
        if len(orig.word) == len(raw_tokens[cursor]):
            # Unchanged by nltk: keep the original timed token.
            aligned.append(orig)
            cursor += 1
            continue
        # Split by nltk: widen the window until the pieces reassemble
        # the original word.
        end = cursor + 2
        while ''.join(raw_tokens[cursor:end]) != orig.word:
            print(raw_tokens[cursor:end], orig.word)
            end += 1
        pieces = end - cursor
        step = orig.duration / pieces
        begin = orig.begin
        for piece in raw_tokens[cursor:end]:
            part = AudioToken(piece)
            part.duration = step
            part.begin = begin
            begin += step
            aligned.append(part)
        cursor = end
    # Sanity checks: both token streams fully consumed.
    assert last_idx == len(self.tokens) - 1
    assert cursor == len(raw_tokens)
    aligned.append(PunctuationToken(".", Punctuation.PERIOD))
    self.tokens = aligned
def parse(self):
    # Generator: stream the timing file and yield one prepared Audio per talk.
    #
    # File format (as read by the code below): lines starting with "#" are
    # headers carrying a talk id; every other line is a whitespace-separated
    # record whose column 2 is the token begin time, column 3 its duration,
    # and column 4 the word.
    #
    # NOTE(review): relies on the Python 2 ``unicode`` builtin -- this file
    # is Python 2 only.
    current_talk_id = 0
    audio = Audio()
    sentence = AudioSentence()
    sentence.tokens = []
    group_name = self._extract_group_name()
    with open(self.filename, "r") as file_:
        for line_unenc in file_:
            self._progress += 1
            # parse line; undecodable bytes are silently dropped
            line = unicode(line_unenc, errors="ignore")
            line = line.rstrip()
            if line.startswith("#"):
                talk_id = self._extract_talk_id(line)
                token_count = len(sentence.tokens)
                # end of sentence reached: finalize timing, re-tokenize,
                # and attach the sentence to the current talk's audio
                if token_count > 0:
                    sentence.begin = sentence.tokens[0].begin
                    sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
                    sentence.tokenize()
                    sentence.prepare()
                    audio.add_sentence(sentence)
                # end of talk reached
                if talk_id != current_talk_id:
                    if token_count > 0:
                        # save audio talk and start a fresh one
                        audio.talk_id = current_talk_id
                        audio.group_name = group_name
                        audio = self._prepare_audio(audio)
                        yield audio
                        audio = Audio()
                        current_talk_id = talk_id
                        # NOTE(review): this continue skips the
                        # sentence reset below, so the flushed sentence
                        # object is reused for the next talk -- looks
                        # unintended; confirm against the data format.
                        continue
                    else:
                        # new talk seen before any tokens (e.g. first header)
                        current_talk_id = talk_id
                # begin a new sentence
                sentence = AudioSentence()
                sentence.tokens = []
            else:
                # parse a token line: columns 2-4 are begin/duration/word
                line_parts = re.split(" +", line)
                begin = float(line_parts[2])
                duration = float(line_parts[3])
                word = line_parts[4]
                # add token to sentence (words are normalized to lower case)
                token = AudioToken(word.lower())
                token.begin = begin
                token.duration = duration
                sentence.append_token(token)
    # EOF: flush the last pending sentence, then the last pending talk
    if len(sentence.tokens) > 0:
        sentence.begin = sentence.tokens[0].begin
        sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
        sentence.tokenize()
        sentence.prepare()
        audio.add_sentence(sentence)
    if len(audio.sentences) > 0:
        audio.talk_id = current_talk_id
        audio.group_name = group_name
        audio = self._prepare_audio(audio)
        yield audio
def parse(self):
    # Generator: stream the timing file and yield one prepared Audio per talk.
    #
    # File format (as read by the code below): lines starting with "#" are
    # headers carrying a talk id; every other line is a whitespace-separated
    # record whose column 2 is the token begin time, column 3 its duration,
    # and column 4 the word.
    #
    # NOTE(review): relies on the Python 2 ``unicode`` builtin -- this file
    # is Python 2 only.
    current_talk_id = 0
    audio = Audio()
    sentence = AudioSentence()
    sentence.tokens = []
    group_name = self._extract_group_name()
    with open(self.filename, "r") as file_:
        for line_unenc in file_:
            self._progress += 1
            # parse line; undecodable bytes are silently dropped
            line = unicode(line_unenc, errors='ignore')
            line = line.rstrip()
            if line.startswith("#"):
                talk_id = self._extract_talk_id(line)
                token_count = len(sentence.tokens)
                # end of sentence reached: finalize timing, re-tokenize,
                # and attach the sentence to the current talk's audio
                if token_count > 0:
                    sentence.begin = sentence.tokens[0].begin
                    sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
                    sentence.tokenize()
                    sentence.prepare()
                    audio.add_sentence(sentence)
                # end of talk reached
                if talk_id != current_talk_id:
                    if token_count > 0:
                        # save audio talk and start a fresh one
                        audio.talk_id = current_talk_id
                        audio.group_name = group_name
                        audio = self._prepare_audio(audio)
                        yield audio
                        audio = Audio()
                        current_talk_id = talk_id
                        # NOTE(review): this continue skips the
                        # sentence reset below, so the flushed sentence
                        # object is reused for the next talk -- looks
                        # unintended; confirm against the data format.
                        continue
                    else:
                        # new talk seen before any tokens (e.g. first header)
                        current_talk_id = talk_id
                # begin a new sentence
                sentence = AudioSentence()
                sentence.tokens = []
            else:
                # parse a token line: columns 2-4 are begin/duration/word
                line_parts = re.split(" +", line)
                begin = float(line_parts[2])
                duration = float(line_parts[3])
                word = line_parts[4]
                # add token to sentence (words are normalized to lower case)
                token = AudioToken(word.lower())
                token.begin = begin
                token.duration = duration
                sentence.append_token(token)
    # EOF: flush the last pending sentence, then the last pending talk
    if (len(sentence.tokens) > 0):
        sentence.begin = sentence.tokens[0].begin
        sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration
        sentence.tokenize()
        sentence.prepare()
        audio.add_sentence(sentence)
    if len(audio.sentences) > 0:
        audio.talk_id = current_talk_id
        audio.group_name = group_name
        audio = self._prepare_audio(audio)
        yield audio