예제 #1
0
    def __init__(self, text):
        """
        Creates a SPLAT Object.

        :param text: either a path that exists on disk (its lines are read
            as utterances) or a string holding the text itself (split on
            newlines into utterances). Every utterance is whitespace-stripped.
        :raises ValueError: if ``text`` is not an existing path and is not a
            str.
        """
        # NOTE: the path probe intentionally runs before the type check.
        # Objects os.path.exists() cannot interpret (e.g. None) make it raise
        # TypeError, and callers may already be catching that exception.
        if os.path.exists(text):
            self.__name = text
            try:
                # 'with' guarantees the handle is closed once the lines have
                # been read, even if reading fails part-way through.
                with open(text, 'r') as source:
                    file_utts = [line.strip() for line in source]
            except IsADirectoryError:
                print("WARNING: '" + text +
                      "' is a directory. It is being treated as a string.")
                self.__splat = text
                self.__utterances = [
                    line.strip() for line in text.split("\n")]
            else:
                self.__utterances = file_utts
                # Each utterance is followed by a single space (including the
                # last one); an empty file yields an empty string.
                self.__splat = "".join(utt + " " for utt in file_utts)
        elif isinstance(text, str):
            # Raw text: the first 20 characters double as the object's name.
            self.__name = text[0:20]
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            raise ValueError("WARNING: SPLAT must be of type str or file.")

        # Utterances and sentences.
        self.__uttcount = len(self.__utterances)
        self.__sentences = CleanSentenizer().sentenize(self.__splat)
        if self.__sentences == []:
            # No sentences detected: fall back to the utterances.
            self.__sentences = self.__utterances
        self.__sentcount = len(self.__sentences)

        # Tokens, types and the counts derived from them.
        self.__rawtokens = RawTokenizer().tokenize(self.__splat)
        self.__tokens = CleanTokenizer().tokenize(self.__splat)
        self.__rawtypes = Util.typify(self.__rawtokens)
        self.__types = Util.typify(self.__tokens)
        self.__wordcount = Util.wordcount(self.__rawtokens)
        self.__unique_wordcount = Util.wordcount(self.__types)
        self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)

        # Words per utterance / per sentence, rounded to 4 places; 0.0 when
        # there is nothing to divide by.
        if self.__uttcount != 0:
            self.__alu = round(
                float(self.__wordcount) / float(self.__uttcount), 4)
        else:
            self.__alu = 0.0
        if self.__sentcount != 0:
            self.__als = round(
                float(self.__wordcount) / float(self.__sentcount), 4)
        else:
            self.__als = 0.0

        # Disfluency data: Util.count_disfluencies returns a pair whose two
        # elements are stored separately (presumably the per-item counts and
        # their average -- confirm against Util).
        utt_disfluencies = Util.count_disfluencies(self.__utterances)
        self.__dpu = utt_disfluencies[0]
        self.__adpu = utt_disfluencies[1]
        sent_disfluencies = Util.count_disfluencies(self.__sentences)
        self.__dps = sent_disfluencies[0]
        self.__adps = sent_disfluencies[1]
        self.__disfluencies = Util.total_disfluencies(self.__dpu)
예제 #2
0
파일: SPLAT.py 프로젝트: meyersbs/SPLAT
    def __init__(self, text):
        """
        Creates a SPLAT Object.

        :param text: either the path of an existing regular file (its lines
            are read as utterances) or a string holding the text itself
            (split on newlines into utterances). Every utterance is
            whitespace-stripped.
        :raises ValueError: if ``text`` is not an existing file and is not a
            str.
        """
        # NOTE: the file probe intentionally runs before the type check.
        # Objects os.path.isfile() cannot interpret (e.g. None) make it raise
        # TypeError, and callers may already be catching that exception.
        if os.path.isfile(text):
            self.__name = text
            try:
                # 'with' guarantees the handle is closed once the lines have
                # been read, even if reading fails part-way through.
                with open(text, 'r') as source:
                    file_utts = [line.strip() for line in source]
            except IsADirectoryError:
                # Defensive: isfile() has already passed, so this should only
                # trigger if the path changes between the check and open().
                print("WARNING: '" + text + "' is a directory. It is being treated as a string.")
                self.__splat = text
                self.__utterances = [
                    line.strip() for line in text.split("\n")]
            else:
                self.__utterances = file_utts
                # Each utterance is followed by a single space (including the
                # last one); an empty file yields an empty string.
                self.__splat = "".join(utt + " " for utt in file_utts)
        elif isinstance(text, str):
            # Raw text: the first 20 characters double as the object's name.
            self.__name = text[0:20]
            self.__splat = text
            self.__utterances = [line.strip() for line in text.split("\n")]
        else:
            raise ValueError("WARNING: SPLAT must be of type str or file.")

        # Utterances and sentences.
        self.__uttcount = len(self.__utterances)
        self.__sentences = CleanSentenizer().sentenize(self.__splat)
        if self.__sentences == []:
            # No sentences detected: fall back to the utterances.
            self.__sentences = self.__utterances
        self.__sentcount = len(self.__sentences)

        # Tokens, types and the counts derived from them.
        self.__rawtokens = RawTokenizer().tokenize(self.__splat)
        self.__tokens = CleanTokenizer().tokenize(self.__splat)
        self.__rawtypes = Util.typify(self.__rawtokens)
        self.__types = Util.typify(self.__tokens)
        self.__wordcount = Util.wordcount(self.__rawtokens)
        self.__unique_wordcount = Util.wordcount(self.__types)
        self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)

        # Words per utterance / per sentence, rounded to 4 places; 0.0 when
        # there is nothing to divide by.
        if self.__uttcount != 0:
            self.__alu = round(
                float(self.__wordcount) / float(self.__uttcount), 4)
        else:
            self.__alu = 0.0
        if self.__sentcount != 0:
            self.__als = round(
                float(self.__wordcount) / float(self.__sentcount), 4)
        else:
            self.__als = 0.0

        # Disfluency data: Util.count_disfluencies returns a pair whose two
        # elements are stored separately (presumably the per-item counts and
        # their average -- confirm against Util).
        utt_disfluencies = Util.count_disfluencies(self.__utterances)
        self.__dpu = utt_disfluencies[0]
        self.__adpu = utt_disfluencies[1]
        sent_disfluencies = Util.count_disfluencies(self.__sentences)
        self.__dps = sent_disfluencies[0]
        self.__adps = sent_disfluencies[1]
        self.__disfluencies = Util.total_disfluencies(self.__dpu)
예제 #3
0
    def run(self, data):
        """
        Build a JSON report of disfluency counts for every corpus in ``data``.

        For each corpus a SPLAT object is built from ``corpus.contents`` and
        its sentences are passed to ``Util.count_disfluencies``. The first
        element of that result is iterated; each entry's eight positional
        counts are labelled, summed per label, and averaged per entry.

        :param data: iterable of corpus objects exposing ``contents`` and
            ``id`` attributes.
        :return: JSON string encoding a list with one dict per corpus (keys
            ``corpus_id``, ``sentences``,
            ``average_disfluencies_per_sentence``, ``total_disfluencies``).
        :raises TransactionException: if a TypeError occurs while processing
            (e.g. the corpus contents are missing).
        """
        # Label for each position (0-7) of a per-sentence count sequence.
        labels = ("UM", "UH", "AH", "ER", "HM",
                  "SILENT PAUSE", "REPETITION", "BREAK")
        results = []
        try:
            for corpus in data:
                temp_bubble = SPLAT(corpus.contents)
                print(corpus.contents)
                print(temp_bubble.sents())
                raw_disfluencies = Util.count_disfluencies(temp_bubble.sents())
                print(raw_disfluencies)

                per_sentence = raw_disfluencies[0]
                sentences = {}
                totals = dict.fromkeys(labels, 0)
                grand_total = 0
                # Label the positional counts so the JSON is self-describing,
                # accumulating per-label and overall totals along the way.
                for key in per_sentence:
                    counts = per_sentence[key]
                    labelled = {
                        label: counts[position]
                        for position, label in enumerate(labels)
                    }
                    sentences[key] = labelled
                    for label, value in labelled.items():
                        grand_total += value
                        totals[label] += value

                # Average disfluencies per sentence over the whole text.
                # With no sentence entries the average is reported as 0.0
                # instead of dividing by zero.
                sentence_count = len(per_sentence)
                if sentence_count:
                    average_disfluencies = float(grand_total / sentence_count)
                else:
                    average_disfluencies = 0.0

                # Per-label totals in label order, with the overall total last.
                total_disfluencies = dict(totals)
                total_disfluencies["TOTAL"] = grand_total

                results.append({
                    'corpus_id': corpus.id,
                    'sentences': sentences,
                    'average_disfluencies_per_sentence': average_disfluencies,
                    'total_disfluencies': total_disfluencies
                })
            results = json.dumps(results)
            print(results)
            return results
        except TypeError as err:
            # Keep the underlying TypeError attached for debugging.
            raise TransactionException(
                'Corpus contents does not exist.') from err