def transform(self, corpus: Corpus) -> Corpus:
    """Computes the average number of questions asked in a conversation

    Every run of one or more consecutive question marks in an utterance's
    text is counted as one question. For each conversation, the mean of the
    counts of its utterances is written to the conversation metadata under
    the key ``self.ATTR_NAME``.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus object, with conversation metadata updated.
    :rtype: Corpus
    """
    if self.verbose:
        print("Finding questions per utterance")

    question_run = re.compile(r'\?+')

    # Counts are keyed by utterance id, not by position in the id list, so
    # the per-conversation lookup below is correct for any kind of id
    # (strings, non-contiguous integers, ...), not only for ids that happen
    # to equal their own index in get_utterance_ids().
    question_counts = {}
    for utter_id in corpus.get_utterance_ids():
        text = corpus.get_utterance(utter_id).text
        question_counts[utter_id] = len(question_run.findall(text))

    if self.verbose:
        print("Finding questions per conversation")

    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        per_utterance = [
            question_counts[utter_id] for utter_id in convo._utterance_ids
        ]
        # average questions per utterance, stored on the conversation
        convo._meta[self.ATTR_NAME] = np.mean(per_utterance)

    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the count of pause and hesitancy words for each utterance,
    then aggregates them for each conversation

    Utterance text is lower-cased and stripped of a fixed set of
    punctuation characters, split on whitespace, and each token is looked
    up in the pause and hesitancy lexicons. The lexicons are normalised the
    same way as the text, so entries containing stripped characters (such
    as ``don't``) still match. Each literal ``...`` in the raw text is
    counted as one additional pause.

    Per utterance, the pause count is stored in the utterance metadata
    under ``self.NAME1`` and the hesitancy count under ``self.NAME2``. Per
    conversation, the mean pause count of its utterances is stored under
    ``self.NAME3`` and the mean hesitancy count under ``self.NAME4``.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus object, with utterance and conversation
        metadata updated.
    :rtype: Corpus
    """
    if self.verbose:
        print("Finding counts of pause and hesitancy words...")

    # 'uh huh' can never equal a single whitespace-separated token; its two
    # halves are counted individually through the 'uh' and 'huh' entries.
    pause_words = [
        'um', 'umm', 'ummm', 'uh', 'uhh', 'uhhh', 'hm', 'hmm', 'hmmm', 'er',
        'err', 'uh huh', 'huh', 'mhm', 'mhmm', 'erm', '...', 'ah', 'ahh',
        'ahem', 'eh', 'ehh', 'ehhh', 'meh'
    ]
    hesitant_words = [
        'maybe', 'not', 'sure', 'unsure', 'probably', 'well', 'okay', 'like',
        'actually', 'basically', 'seriously', 'totally', 'literally', 'know',
        'mean', 'guess', 'suppose', 'but', 'something', 'so', 'wow', 'just',
        'really', 'later', 'wait', 'future', 'almost', 'slightly', 'perhaps',
        'somehow', 'sort', 'kind', 'little', 'somewhat', 'hey', 'alas', 'see',
        'sounds', 'ok', 'roughly', 'why', 'how', 'yep', 'yup', 'may',
        'possibly', 'might', 'could', 'doubt', 'skeptical', 'don\'t',
        'won\'t', 'nah'
    ]

    # Characters removed from the text before tokenising.
    strip_punctuation = str.maketrans(
        '', '', '!.:?\',"@#$%^&*()-~`_+=><[]{}')

    def normalize(raw):
        # Lower-case so sentence-initial "Um" / "Maybe" match the lexicon.
        return raw.lower().translate(strip_punctuation)

    # Run the lexicons through the same normalisation as the text so that
    # "don't" / "won't" are stored as "dont" / "wont" and can match cleaned
    # tokens. '...' normalises to the empty string, which is dropped here
    # and handled separately on the raw text below.
    pause_lexicon = {normalize(word) for word in pause_words} - {''}
    hesitant_lexicon = {normalize(word) for word in hesitant_words} - {''}

    # Counts are keyed by utterance id, not by list position, so that the
    # per-conversation lookup works for any kind of utterance id.
    pause_counts = {}
    hesitancy_counts = {}
    for utter_id in corpus.get_utterance_ids():
        utterance = corpus.get_utterance(utter_id)
        text = utterance.text
        tokens = normalize(text).split()

        # An ellipsis disappears once '.' is stripped, so count it on the
        # raw text instead.
        npause = sum(token in pause_lexicon for token in tokens)
        npause += text.count('...')
        nhesitant = sum(token in hesitant_lexicon for token in tokens)

        pause_counts[utter_id] = npause
        hesitancy_counts[utter_id] = nhesitant
        utterance.meta[self.NAME1] = npause
        utterance.meta[self.NAME2] = nhesitant

    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        utter_ids = list(convo._utterance_ids)
        convo._meta[self.NAME3] = np.mean(
            [pause_counts[utter_id] for utter_id in utter_ids])
        convo._meta[self.NAME4] = np.mean(
            [hesitancy_counts[utter_id] for utter_id in utter_ids])

    return corpus