def plot_freq(self, x=None):
    """Graph the token frequency distribution with matplotlib.

    :param x: optional cap on how many words are plotted
    """
    dist = self.__freq_dist
    if dist is None:
        # Lazily build and cache the distribution on first use.
        dist = Util.get_freq_dist(self.__tokens)
        self.__freq_dist = dist
    Util.plot_freq_dist(dist, x)
    return ''
def get_least_freq(self, x=None):
    """Return the x least frequent words with their frequencies.

    Words are ordered least-frequent first. If *x* is None, all words are
    returned.

    :param x: the number of least frequent words to return; coerced with
        ``int()``, and non-positive values yield an empty list (preserving
        the original counting loop's behavior).
    :return: a list of ``(word, frequency)`` tuples.
    """
    if self.__freq_dist is None:
        self.__freq_dist = Util.get_freq_dist(self.__tokens)
    # most_common() sorts most->least frequent; reverse so rarest come first.
    least_first = list(reversed(self.__freq_dist.most_common()))
    if x is None:
        return least_first
    # The original loop only collected items while count < int(x), so a
    # non-positive x must produce an empty list, not a negative slice.
    return least_first[:max(int(x), 0)]
def __init__(self, text):
    """ Creates a SPLAT Object.

    :param text: a path to a readable text file, or the raw text to analyze
    :raises ValueError: if text is neither a string nor a path to a file
    """
    if os.path.isfile(text):
        # isfile (not exists) so directories are not mistaken for readable
        # files -- consistent with the sibling constructor in this file.
        temp_text = ""
        temp_utts = []
        self.__name = text
        try:
            # 'with' guarantees the handle is closed (the original leaked it).
            with open(text, 'r') as infile:
                for line in infile:
                    temp_utts.append(line.strip())
                    temp_text += line.strip() + " "
            self.__splat = temp_text
            self.__utterances = temp_utts
        except IsADirectoryError:
            # Guards against a race where `text` becomes a directory between
            # the isfile() check and open(); demote to plain-string input.
            print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
    elif isinstance(text, str):
        self.__name = text[0:20]
        self.__splat = text
        self.__utterances = [line.strip() for line in text.split("\n")]
    else:
        raise ValueError("WARNING: SPLAT must be of type str or file.")
    self.__uttcount = len(self.__utterances)
    self.__sentences = CleanSentenizer().sentenize(self.__splat)
    if self.__sentences == []:
        # Sentenizer found nothing; fall back to utterance splitting.
        self.__sentences = self.__utterances
    self.__sentcount = len(self.__sentences)
    self.__rawtokens = RawTokenizer().tokenize(self.__splat)
    self.__tokens = CleanTokenizer().tokenize(self.__splat)
    self.__rawtypes = Util.typify(self.__rawtokens)
    self.__types = Util.typify(self.__tokens)
    self.__wordcount = Util.wordcount(self.__rawtokens)
    self.__unique_wordcount = Util.wordcount(self.__types)
    self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)
    # Average words per utterance / per sentence; 0.0 guards empty input.
    self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
    self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0
    # NOTE(review): count_disfluencies appears to return
    # (per-item counts, average) -- confirm against Util.
    temp_dpu = Util.count_disfluencies(self.__utterances)
    self.__dpu = temp_dpu[0]
    self.__adpu = temp_dpu[1]
    temp_dps = Util.count_disfluencies(self.__sentences)
    self.__dps = temp_dps[0]
    self.__adps = temp_dps[1]
    self.__disfluencies = Util.total_disfluencies(self.__dpu)
def pronouns(self):
    """Pretty-print the pronoun statistics as an aligned table."""
    if self.__pronouns == {}:
        self.__pronouns = Util.count_pronouns(self.tokens())
    row = "{0:12} {1:^12} {2:50}"
    print(row.format("Pronoun", "Frequency", "Type"))
    for word, info in self.__pronouns.items():
        description = info[1] + " " + info[2] + " (" + info[3] + ")"
        print(row.format(word, info[0], description))
    return ''
def __init__(self, text):
    """ Creates a SPLAT Object.

    :param text: a path to a readable text file, or the raw text itself.
        File contents are read line-by-line; plain strings are split on
        newlines to form utterances.
    :raises ValueError: if text is neither a string nor a path to a file.
    """
    if os.path.isfile(text):
        temp_text = ""
        temp_utts = []
        self.__name = text
        try:
            # NOTE(review): this handle is never closed -- consider 'with'.
            for line in open(text, 'r'):
                temp_utts.append(line.strip())
                temp_text += line.strip() + " "
            self.__splat = temp_text
            self.__utterances = temp_utts
        except IsADirectoryError:
            # A directory path is demoted to plain-string input.
            print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
            self.__splat = text
            temp_utts = []
            for line in text.split("\n"):
                temp_utts.append(line.strip())
            self.__utterances = temp_utts
    elif type(text) == str:
        # Raw text: the first 20 characters double as the object's name.
        self.__name = text[0:20]
        self.__splat = text
        temp_utts = []
        for line in text.split("\n"):
            temp_utts.append(line.strip())
        self.__utterances = temp_utts
    else:
        raise ValueError("WARNING: SPLAT must be of type str or file.")
    self.__uttcount = len(self.__utterances)
    self.__sentences = CleanSentenizer().sentenize(self.__splat)
    if self.__sentences == []:
        # Sentenizer found nothing; fall back to utterance splitting.
        self.__sentences = self.__utterances
    self.__sentcount = len(self.__sentences)
    self.__rawtokens = RawTokenizer().tokenize(self.__splat)
    self.__tokens = CleanTokenizer().tokenize(self.__splat)
    self.__rawtypes = Util.typify(self.__rawtokens)
    self.__types = Util.typify(self.__tokens)
    self.__wordcount = Util.wordcount(self.__rawtokens)
    self.__unique_wordcount = Util.wordcount(self.__types)
    self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)
    # Average words per utterance / per sentence; 0.0 guards empty input.
    self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
    self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0
    # NOTE(review): count_disfluencies appears to return
    # (per-item counts, average) -- confirm against Util.
    temp_dpu = Util.count_disfluencies(self.__utterances)
    self.__dpu = temp_dpu[0]
    self.__adpu = temp_dpu[1]
    temp_dps = Util.count_disfluencies(self.__sentences)
    self.__dps = temp_dps[0]
    self.__adps = temp_dps[1]
    self.__disfluencies = Util.total_disfluencies(self.__dpu)
def pronouns(self):
    """Display a formatted table of pronoun statistics."""
    if self.__pronouns == {}:
        self.__pronouns = Util.count_pronouns(self.tokens())
    layout = "{0:12} {1:^12} {2:50}"
    header = layout.format("Pronoun", "Frequency", "Type")
    print(header)
    for pronoun, stats in self.__pronouns.items():
        detail = stats[1] + " " + stats[2] + " (" + stats[3] + ")"
        print(layout.format(pronoun, stats[0], detail))
    return ''
def get_most_freq(self, x=None):
    """Return the x most frequent words with their frequencies.

    All words are returned when *x* is None or not a positive number.

    :param x: the number of most frequent words to return
    """
    if self.__freq_dist is None:
        self.__freq_dist = Util.get_freq_dist(self.__tokens)
    # most_common(None) returns every entry, which matches the original's
    # fallthrough for None and non-positive x.
    limit = x if x is not None and x > 0 else None
    return self.__freq_dist.most_common(limit)
def drawtrees(self):
    """Render the syntactic parse trees via matplotlib and nltk."""
    trees = self.treestrings()
    Util.draw_trees(trees)
    return ''
def pos_counts(self):
    """Return a dictionary mapping POS tags to their frequencies.

    Computed on first call and cached on the instance.
    """
    cached = self.__poscounts
    if cached is None:
        cached = Util.get_pos_counts(self.pos())
        self.__poscounts = cached
    return cached
def unique_function_words(self):
    """Return a list of unique function words (lazily computed and cached)."""
    cached = self.__u_fwords
    if cached is None:
        cached = Util.get_unique_function_words(self.__types)
        self.__u_fwords = cached
    return cached
def unique_content_words(self):
    """Return a list of unique content words.

    The result is computed on first call and memoized on the instance.
    """
    if self.__u_cwords is not None:
        return self.__u_cwords
    self.__u_cwords = Util.get_unique_content_words(self.__types)
    return self.__u_cwords
def function_words(self):
    """Return a list of function words drawn from the clean tokens (cached)."""
    if self.__fwords is not None:
        return self.__fwords
    self.__fwords = Util.get_function_words(self.__tokens)
    return self.__fwords
def run(self, data):
    """Compute per-sentence and total disfluency statistics for each corpus.

    :param data: an iterable of corpus objects exposing ``id`` and ``contents``
    :return: a JSON string -- a list with one entry per corpus, holding the
        per-sentence breakdown, per-sentence average, and per-type totals
    :raises TransactionException: if a corpus has no ``contents``
    """
    # Order matters: it fixes the JSON key order and must match the
    # positional layout of Util.count_disfluencies' 8-element rows.
    labels = ("UM", "UH", "AH", "ER", "HM", "SILENT PAUSE", "REPETITION", "BREAK")
    results = []
    try:
        for corpus in data:
            temp_bubble = SPLAT(corpus.contents)
            print(corpus.contents)            # debug output retained from original
            print(temp_bubble.sents())
            raw_disfluencies = Util.count_disfluencies(temp_bubble.sents())
            print(raw_disfluencies)
            per_sentence = raw_disfluencies[0]
            sentences = {}
            totals = dict.fromkeys(labels, 0)
            for key, counts in per_sentence.items():
                # counts is an 8-element sequence ordered like `labels`.
                sentence_stats = dict(zip(labels, counts))
                sentences[key] = sentence_stats
                for label in labels:
                    totals[label] += sentence_stats[label]
            grand_total = sum(totals.values())
            # Average disfluencies per sentence; guard the empty case the
            # original would have crashed on (ZeroDivisionError).
            average_disfluencies = (
                float(grand_total) / len(per_sentence) if per_sentence else 0.0
            )
            total_disfluencies = dict(totals)
            total_disfluencies["TOTAL"] = grand_total
            results.append({
                'corpus_id': corpus.id,
                'sentences': sentences,
                'average_disfluencies_per_sentence': average_disfluencies,
                'total_disfluencies': total_disfluencies
            })
        results = json.dumps(results)
        print(results)
        return results
    except TypeError:
        raise TransactionException('Corpus contents does not exist.')
def raw_pronouns(self):
    """Return the raw output of /splat/Util.count_pronouns()."""
    pronoun_map = self.__pronouns
    if pronoun_map == {}:
        pronoun_map = Util.count_pronouns(self.tokens())
        self.__pronouns = pronoun_map
    return pronoun_map
def content_function_ratio(self):
    """Return the ratio of content words to function words (cached)."""
    if self.__cfr is not None:
        return self.__cfr
    self.__cfr = Util.get_content_function_ratio(self.content_words(), self.function_words())
    return self.__cfr
def content_words(self):
    """Return a list of content words (computed once, then cached)."""
    words = self.__cwords
    if words is None:
        words = Util.get_content_words(self.__tokens)
        self.__cwords = words
    return words
def max_depth(self):
    """Return the maximum depth across all syntactic parse trees (cached)."""
    if self.__maxdepth is not None:
        return self.__maxdepth
    self.__maxdepth = Util.get_max_depth(self.treestrings())
    return self.__maxdepth
def content_function_ratio(self):
    """Compute (once) and return the content-to-function word ratio."""
    ratio = self.__cfr
    if ratio is None:
        ratio = Util.get_content_function_ratio(
            self.content_words(), self.function_words())
        self.__cfr = ratio
    return ratio