Exemplo n.º 1
0
 def plot_freq(self, x=None):
     """ Graphs the frequency distribution through ``Util.plot_freq_dist``.

     The distribution is built from the object's tokens with
     ``Util.get_freq_dist`` the first time it is needed, then cached.

     :param x: forwarded unchanged to ``Util.plot_freq_dist``
         (presumably how many entries to plot -- confirm in Util).
     :return: always the empty string.
     """
     distribution = self.__freq_dist
     if distribution is None:
         distribution = Util.get_freq_dist(self.__tokens)
         self.__freq_dist = distribution
     Util.plot_freq_dist(distribution, x)
     return ''
Exemplo n.º 2
0
 def plot_freq(self, x=None):
     """ Plots the cached frequency distribution and returns ''.

     When no distribution has been cached yet, one is created from the
     tokens via ``Util.get_freq_dist`` and stored for later calls.

     :param x: passed straight through to ``Util.plot_freq_dist``.
     """
     needs_build = self.__freq_dist is None
     if needs_build:
         tokens = self.__tokens
         self.__freq_dist = Util.get_freq_dist(tokens)
     Util.plot_freq_dist(self.__freq_dist, x)
     return ''
Exemplo n.º 3
0
    def get_least_freq(self, x=None):
        """
        Returns the x least frequent words with their frequencies,
        or all words with their frequencies if x is not specified.

        The result is ordered from least to most frequent, i.e. it is the
        reverse of what ``most_common()`` yields.

        :param x: the number of least frequent words to return; it is
            converted with ``int()``, and a value of zero or less yields
            an empty list
        :return: a list of the items produced by ``most_common()``
        """
        if self.__freq_dist is None:
            self.__freq_dist = Util.get_freq_dist(self.__tokens)

        least_common = list(reversed(self.__freq_dist.most_common()))
        if x is None or not least_common:
            return least_common
        # Clamp at zero: a negative slice bound would count from the end,
        # whereas a non-positive x has always produced an empty list here.
        return least_common[:max(int(x), 0)]
Exemplo n.º 4
0
    def get_least_freq(self, x=None):
        """
        Returns the x least frequent words with their frequencies,
        or all words with their frequencies if x is not specified.

        Items come back in ascending order of frequency (the reverse of
        the ``most_common()`` ordering).

        :param x: the number of least frequent words to return; it is
            converted with ``int()``, and a value of zero or less yields
            an empty list
        :return: a list of the items produced by ``most_common()``
        """
        if self.__freq_dist is None:
            self.__freq_dist = Util.get_freq_dist(self.__tokens)

        ascending = self.__freq_dist.most_common()[::-1]
        if x is None or not ascending:
            return ascending
        # A negative bound would slice from the end; non-positive x must
        # keep returning an empty list, so clamp it to zero.
        limit = max(int(x), 0)
        return ascending[:limit]
Exemplo n.º 5
0
    def __init__(self, text):
        """
        Creates a SPLAT Object.

        :param text: either a path that exists on disk, whose lines are
            read as utterances, or a string holding the text itself (one
            utterance per line). An existing directory path is treated as
            a plain string after a warning is printed.
        :raises ValueError: if text is neither an existing path nor a str.
        """
        if os.path.exists(text):
            self.__name = text
            try:
                # The context manager closes the handle even if reading fails.
                with open(text, 'r') as source:
                    file_utts = [line.strip() for line in source]
            except IsADirectoryError:
                print("WARNING: '" + text +
                      "' is a directory. It is being treated as a string.")
                self.__splat = text
                self.__utterances = [
                    line.strip() for line in text.split("\n")]
            else:
                # Each utterance is followed by a single space, the last
                # one included.
                self.__splat = "".join(utt + " " for utt in file_utts)
                self.__utterances = file_utts
        elif isinstance(text, str):
            # Only the first 20 characters of the text serve as the name.
            self.__name = text[0:20]
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            raise ValueError("WARNING: SPLAT must be of type str or file.")

        self.__uttcount = len(self.__utterances)
        self.__sentences = CleanSentenizer().sentenize(self.__splat)
        # Fall back to the utterances when the sentenizer finds nothing.
        if self.__sentences == []:
            self.__sentences = self.__utterances
        self.__sentcount = len(self.__sentences)
        self.__rawtokens = RawTokenizer().tokenize(self.__splat)
        self.__tokens = CleanTokenizer().tokenize(self.__splat)
        self.__rawtypes = Util.typify(self.__rawtokens)
        self.__types = Util.typify(self.__tokens)
        self.__wordcount = Util.wordcount(self.__rawtokens)
        self.__unique_wordcount = Util.wordcount(self.__types)
        self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)
        # Word count per utterance / per sentence, rounded to 4 places;
        # 0.0 when there is nothing to divide by.
        self.__alu = round(
            float(self.__wordcount) /
            float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
        self.__als = round(
            float(self.__wordcount) /
            float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0
        # count_disfluencies results are consumed positionally: element 0
        # and element 1 are stored separately.
        temp_dpu = Util.count_disfluencies(self.__utterances)
        self.__dpu = temp_dpu[0]
        self.__adpu = temp_dpu[1]
        temp_dps = Util.count_disfluencies(self.__sentences)
        self.__dps = temp_dps[0]
        self.__adps = temp_dps[1]
        self.__disfluencies = Util.total_disfluencies(self.__dpu)
Exemplo n.º 6
0
    def pronouns(self):
        """ Prints a three-column pronoun table and returns ''.

        The pronoun data is taken from the cache; while the cache is an
        empty dict it is (re)computed with ``Util.count_pronouns`` from
        ``self.tokens()``. For every entry, element 0 is shown under
        "Frequency" and elements 1-3 are combined under "Type".
        """
        stats = self.__pronouns
        if stats == {}:
            stats = Util.count_pronouns(self.tokens())
            self.__pronouns = stats

        row = "{0:12} {1:^12} {2:50}"
        header = row.format("Pronoun", "Frequency", "Type")
        print(header)
        for word, entry in stats.items():
            frequency = entry[0]
            description = entry[1] + " " + entry[2] + " (" + entry[3] + ")"
            print(row.format(word, frequency, description))

        return ''
Exemplo n.º 7
0
    def __init__(self, text):
        """
        Creates a SPLAT Object.

        :param text: either the path of an existing regular file, whose
            lines are read as utterances, or a string holding the text
            itself (one utterance per line).
        :raises ValueError: if text is neither an existing file nor a str.
        """
        if os.path.isfile(text):
            self.__name = text
            try:
                # The context manager closes the handle even if reading fails.
                with open(text, 'r') as source:
                    file_utts = [line.strip() for line in source]
            except IsADirectoryError:
                # os.path.isfile() is False for directories, so this is only
                # a safeguard (e.g. the path changing between check and open).
                print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
                self.__splat = text
                self.__utterances = [line.strip() for line in text.split("\n")]
            else:
                # Each utterance is followed by a single space, the last
                # one included.
                self.__splat = "".join(utt + " " for utt in file_utts)
                self.__utterances = file_utts
        elif isinstance(text, str):
            # Only the first 20 characters of the text serve as the name.
            self.__name = text[0:20]
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            raise ValueError("WARNING: SPLAT must be of type str or file.")

        self.__uttcount = len(self.__utterances)
        self.__sentences = CleanSentenizer().sentenize(self.__splat)
        # Fall back to the utterances when the sentenizer finds nothing.
        if self.__sentences == []:
            self.__sentences = self.__utterances
        self.__sentcount = len(self.__sentences)
        self.__rawtokens = RawTokenizer().tokenize(self.__splat)
        self.__tokens = CleanTokenizer().tokenize(self.__splat)
        self.__rawtypes = Util.typify(self.__rawtokens)
        self.__types = Util.typify(self.__tokens)
        self.__wordcount = Util.wordcount(self.__rawtokens)
        self.__unique_wordcount = Util.wordcount(self.__types)
        self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)
        # Word count per utterance / per sentence, rounded to 4 places;
        # 0.0 when there is nothing to divide by.
        self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
        self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0
        # count_disfluencies results are consumed positionally: element 0
        # and element 1 are stored separately.
        temp_dpu = Util.count_disfluencies(self.__utterances)
        self.__dpu = temp_dpu[0]
        self.__adpu = temp_dpu[1]
        temp_dps = Util.count_disfluencies(self.__sentences)
        self.__dps = temp_dps[0]
        self.__adps = temp_dps[1]
        self.__disfluencies = Util.total_disfluencies(self.__dpu)
Exemplo n.º 8
0
    def pronouns(self):
        """ Writes the pronoun statistics to stdout as a table; returns ''.

        While the cached pronoun data is an empty dict it is computed with
        ``Util.count_pronouns(self.tokens())`` and stored. Columns are
        "Pronoun", "Frequency" (element 0 of each entry) and "Type"
        (elements 1, 2 and 3 of each entry joined into one string).
        """
        if self.__pronouns == {}:
            tokens = self.tokens()
            self.__pronouns = Util.count_pronouns(tokens)

        layout = "{0:12} {1:^12} {2:50}"
        print(layout.format("Pronoun", "Frequency", "Type"))
        for pronoun, details in self.__pronouns.items():
            count = details[0]
            kind = details[1] + " " + details[2] + " (" + details[3] + ")"
            line = layout.format(pronoun, count, kind)
            print(line)

        return ''
Exemplo n.º 9
0
 def get_most_freq(self, x=None):
     """
     Returns the x most frequent words with their frequencies,
     or all words with their frequencies if x is not specified.

     :param x: the number of most frequent words to return; it is
         converted with ``int()`` so numeric strings are accepted, and a
         value of zero or less returns every word
     :return: the list produced by ``most_common()``
     """
     if self.__freq_dist is None:
         self.__freq_dist = Util.get_freq_dist(self.__tokens)
     limit = None if x is None else int(x)
     if limit is not None and limit > 0:
         return self.__freq_dist.most_common(limit)
     # Both "not specified" and non-positive limits mean "everything".
     return self.__freq_dist.most_common()
Exemplo n.º 10
0
 def get_most_freq(self, x=None):
     """
     Returns the x most frequent words with their frequencies,
     or all words with their frequencies if x is not specified.

     :param x: the number of most frequent words to return; it is
         converted with ``int()`` so numeric strings are accepted, and a
         value of zero or less returns every word
     :return: the list produced by ``most_common()``
     """
     if self.__freq_dist is None:
         self.__freq_dist = Util.get_freq_dist(self.__tokens)
     if x is not None:
         wanted = int(x)
         if wanted > 0:
             return self.__freq_dist.most_common(wanted)
     # No limit given, or a non-positive one: return the full ranking.
     return self.__freq_dist.most_common()
Exemplo n.º 11
0
 def drawtrees(self):
     """ Hands ``self.treestrings()`` to ``Util.draw_trees`` for drawing; returns ''. """
     trees = self.treestrings()
     Util.draw_trees(trees)
     return ''
Exemplo n.º 12
0
 def pos_counts(self):
     """ Returns the POS-tag frequency dictionary, computing it from ``self.pos()`` on first use and caching it. """
     cached = self.__poscounts
     if cached is not None:
         return cached
     cached = Util.get_pos_counts(self.pos())
     self.__poscounts = cached
     return cached
Exemplo n.º 13
0
 def unique_function_words(self):
     """ Returns the unique function words, derived from the types on first use and cached afterwards. """
     cached = self.__u_fwords
     if cached is not None:
         return cached
     cached = Util.get_unique_function_words(self.__types)
     self.__u_fwords = cached
     return cached
Exemplo n.º 14
0
 def unique_content_words(self):
     """ Returns the unique content words, derived from the types on first use and cached afterwards. """
     cached = self.__u_cwords
     if cached is not None:
         return cached
     cached = Util.get_unique_content_words(self.__types)
     self.__u_cwords = cached
     return cached
Exemplo n.º 15
0
 def function_words(self):
     """ Returns the function words, derived from the tokens on first use and cached afterwards. """
     cached = self.__fwords
     if cached is not None:
         return cached
     cached = Util.get_function_words(self.__tokens)
     self.__fwords = cached
     return cached
Exemplo n.º 16
0
 def unique_function_words(self):
     """ Gives back the cached list of unique function words, building it from the types when absent. """
     if self.__u_fwords is not None:
         return self.__u_fwords
     types = self.__types
     self.__u_fwords = Util.get_unique_function_words(types)
     return self.__u_fwords
Exemplo n.º 17
0
    def run(self, data):
        """
        Summarises the disfluencies of every corpus in ``data`` as JSON.

        For each corpus a SPLAT object is built from ``corpus.contents``
        and ``Util.count_disfluencies`` is run on its sentences. Element 0
        of that result is read as a mapping from sentence to eight counts,
        which are labelled by position (see ``labels`` below).

        :param data: iterable of objects exposing ``contents`` and ``id``.
        :return: a JSON string holding one object per corpus with the keys
            ``corpus_id``, ``sentences``,
            ``average_disfluencies_per_sentence`` and
            ``total_disfluencies``.
        :raises TransactionException: when a TypeError occurs while the
            corpora are processed or serialised.
        """
        # Position of each count inside a per-sentence entry; the tuple
        # order is also the key order of the emitted dictionaries.
        labels = ("UM", "UH", "AH", "ER", "HM",
                  "SILENT PAUSE", "REPETITION", "BREAK")
        results = []
        try:
            for corpus in data:
                temp_bubble = SPLAT(corpus.contents)
                # Diagnostic output, kept from the original implementation.
                print(corpus.contents)
                print(temp_bubble.sents())
                raw_disfluencies = Util.count_disfluencies(temp_bubble.sents())
                print(raw_disfluencies)
                per_sentence = raw_disfluencies[0]

                sentences = {}
                totals = dict.fromkeys(labels, 0)
                grand_total = 0
                # Relabel the positional counts and accumulate the totals
                # for each disfluency type.
                for key in per_sentence:
                    counts = per_sentence[key]
                    temp_dis = {label: counts[pos]
                                for pos, label in enumerate(labels)}
                    sentences[key] = temp_dis
                    for label, value in temp_dis.items():
                        totals[label] += value
                        grand_total += value

                # Average disfluencies per sentence in the whole text;
                # 0.0 instead of a ZeroDivisionError when there are none.
                sentence_count = len(per_sentence)
                average_disfluencies = (
                    float(grand_total / sentence_count)
                    if sentence_count else 0.0)

                total_disfluencies = dict(totals)
                total_disfluencies["TOTAL"] = grand_total

                results.append({
                    'corpus_id': corpus.id,
                    'sentences': sentences,
                    'average_disfluencies_per_sentence': average_disfluencies,
                    'total_disfluencies': total_disfluencies
                })
            results = json.dumps(results)
            print(results)
            return results
        except TypeError:
            raise TransactionException('Corpus contents does not exist.')
Exemplo n.º 18
0
 def raw_pronouns(self):
     """ Returns the raw ``Util.count_pronouns`` result, computing it from ``self.tokens()`` while the cache is an empty dict. """
     stats = self.__pronouns
     if stats == {}:
         stats = Util.count_pronouns(self.tokens())
         self.__pronouns = stats
     return stats
Exemplo n.º 19
0
 def content_function_ratio(self):
     """ Returns the content-to-function word ratio, computed by ``Util.get_content_function_ratio`` on first use and cached. """
     cached = self.__cfr
     if cached is not None:
         return cached
     content = self.content_words()
     function = self.function_words()
     cached = Util.get_content_function_ratio(content, function)
     self.__cfr = cached
     return cached
Exemplo n.º 20
0
 def content_words(self):
     """ Returns the content words, derived from the tokens on first use and cached afterwards. """
     cached = self.__cwords
     if cached is not None:
         return cached
     cached = Util.get_content_words(self.__tokens)
     self.__cwords = cached
     return cached
Exemplo n.º 21
0
 def function_words(self):
     """ Gives back the cached list of function words, building it from the tokens when absent. """
     if self.__fwords is not None:
         return self.__fwords
     tokens = self.__tokens
     self.__fwords = Util.get_function_words(tokens)
     return self.__fwords
Exemplo n.º 22
0
 def max_depth(self):
     """ Returns the max depth over ``self.treestrings()``, computed by ``Util.get_max_depth`` on first use and cached. """
     cached = self.__maxdepth
     if cached is not None:
         return cached
     cached = Util.get_max_depth(self.treestrings())
     self.__maxdepth = cached
     return cached
Exemplo n.º 23
0
 def drawtrees(self):
     """ Draws the trees from ``self.treestrings()`` via ``Util.draw_trees`` and returns an empty string. """
     tree_strings = self.treestrings()
     Util.draw_trees(tree_strings)
     nothing = ''
     return nothing
Exemplo n.º 24
0
 def pos_counts(self):
     """ Gives back the cached dictionary of POS tags to frequencies, building it from ``self.pos()`` when absent. """
     if self.__poscounts is not None:
         return self.__poscounts
     tags = self.pos()
     self.__poscounts = Util.get_pos_counts(tags)
     return self.__poscounts
Exemplo n.º 25
0
 def max_depth(self):
     """ Gives back the cached max depth of the parse trees, computing it from ``self.treestrings()`` when absent. """
     if self.__maxdepth is not None:
         return self.__maxdepth
     trees = self.treestrings()
     self.__maxdepth = Util.get_max_depth(trees)
     return self.__maxdepth
Exemplo n.º 26
0
 def content_function_ratio(self):
     """ Gives back the cached ratio of content words to function words, computing it when absent. """
     if self.__cfr is not None:
         return self.__cfr
     content = self.content_words()
     function = self.function_words()
     self.__cfr = Util.get_content_function_ratio(content, function)
     return self.__cfr
Exemplo n.º 27
0
 def content_words(self):
     """ Gives back the cached list of content words, building it from the tokens when absent. """
     if self.__cwords is not None:
         return self.__cwords
     tokens = self.__tokens
     self.__cwords = Util.get_content_words(tokens)
     return self.__cwords
Exemplo n.º 28
0
 def raw_pronouns(self):
     """ Gives back the stored ``Util.count_pronouns`` output, filling it from ``self.tokens()`` while it is an empty dict. """
     if self.__pronouns != {}:
         return self.__pronouns
     tokens = self.tokens()
     self.__pronouns = Util.count_pronouns(tokens)
     return self.__pronouns
Exemplo n.º 29
0
 def unique_content_words(self):
     """ Gives back the cached list of unique content words, building it from the types when absent. """
     if self.__u_cwords is not None:
         return self.__u_cwords
     types = self.__types
     self.__u_cwords = Util.get_unique_content_words(types)
     return self.__u_cwords