예제 #1
0
    def distFrequency(self):
        """
        Return the word frequency distribution of this file, building it on
        first use.

        Every word returned by ``readIpu`` for every line of ``getLines`` is
        lower-cased and counted, except the words listed (whitespace
        separated) on the first line of the ``bannedWords`` file found in
        the directory given by ``getTextPath()``. The result is cached in
        ``self.__distFreq`` so the lines are only processed once.

        :return: the FreqDist accumulated over all the lines of this file
        """
        if self.__distFreq is None:
            # Only touch the banned-word file when the distribution really
            # has to be computed, and make sure the handle gets closed.
            bannedPath = path.join(getTextPath(), "bannedWords")
            with open(bannedPath, "r") as bannedFile:
                # readline() without a size limit reads the whole first
                # line; the former readline(1)[0] kept only the first
                # character of the file. split() also drops the trailing
                # line terminator.
                bannedWords = set(bannedFile.readline().split())

            total = FreqDist()
            for line in self.getLines():
                words = [word.lower() for word in self.readIpu(line)]
                # Filter out every occurrence of a banned word (list.remove
                # would only drop the first one on each line).
                total += FreqDist(
                    [word for word in words if word not in bannedWords])

            self.__distFreq = total

        return self.__distFreq
예제 #2
0
    def initProperties(self):
        """
        Compute the basic statistics of this file from its lines.

        Sets ``self.__duration`` to the floor of (third field of the last
        line - second field of the first line), the fields being obtained
        by splitting on the corpus delimiter (presumably the end time of
        the last IPU and the start time of the first one - TODO confirm).
        Increments ``self.__nbOfLines`` once per line and
        ``self.__numberOfWords`` for every word returned by ``readIpu``
        that is not listed on the first line of the ``bannedWords`` file;
        a word containing underscores counts as one word per
        underscore-separated part. Finally stores the number of distinct
        kept words in ``self.__numberUniqueWords``.

        :raises IndexError: if the file has no line, or if its first/last
            line has fewer fields than expected
        """
        bannedPath = path.join(getTextPath(), "bannedWords")
        with open(bannedPath, "r") as bannedFile:
            # readline() without a size limit reads the whole first line;
            # the former readline(1)[0] kept only the first character of
            # the file. split() also drops the trailing line terminator.
            bannedWords = set(bannedFile.readline().split())

        lines = self.getLines()
        delimiter = self.__corpus.getDelimiter()
        lastField = float(lines[-1].split(delimiter)[2])
        firstField = float(lines[0].split(delimiter)[1])
        self.__duration = math.floor(lastField - firstField)

        uniqueWords = set()
        for line in lines:
            self.__nbOfLines += 1

            for word in self.readIpu(line):
                if word in bannedWords:
                    continue
                # An underscore joins several words into a single token.
                self.__numberOfWords += len(word.split('_'))
                uniqueWords.add(word)

        self.__numberUniqueWords = len(uniqueWords)
예제 #3
0
def isFeedBackIpu_en(ipuContent):
    """
    Tell whether an IPU is classified as feedback, using the English lists.

    Delegates to ``isSpecialIpu`` with the ``feedback_en`` and
    ``neutral_en`` files of the text directory and the fixed settings
    0.5 and 3 (their exact meaning is defined by ``isSpecialIpu``).

    :param ipuContent: the IPU content handed over to ``isSpecialIpu``
    :return: whatever ``isSpecialIpu`` returns for these settings
    """
    feedbackFile = path.join(getTextPath(), "feedback_en")
    neutralFile = path.join(getTextPath(), "neutral_en")
    return isSpecialIpu(ipuContent, 0.5, feedbackFile, neutralFile, 3)
예제 #4
0
def isntFeedBackIpu_fr(ipuContent):
    """
    Tell whether an IPU is NOT classified as feedback, using the French
    lists.

    Negates the result of ``isSpecialIpu`` called with the ``feedback_fr``
    and ``neutral_fr`` files of the text directory and the fixed settings
    0.5 and 3 (their exact meaning is defined by ``isSpecialIpu``).

    :param ipuContent: the IPU content handed over to ``isSpecialIpu``
    :return: True when ``isSpecialIpu`` gives a falsy result, else False
    """
    feedbackFile = path.join(getTextPath(), "feedback_fr")
    neutralFile = path.join(getTextPath(), "neutral_fr")
    isFeedback = isSpecialIpu(ipuContent, 0.5, feedbackFile, neutralFile, 3)
    return not isFeedback
예제 #5
0
def isFill(ipuContent):
    """
    Tell whether an IPU is classified as a filler, using the French list.

    Delegates to ``isSpecialIpu`` with the ``fill_fr`` file of the text
    directory and the fixed setting 1 (its exact meaning is defined by
    ``isSpecialIpu``).

    :param ipuContent: the IPU content handed over to ``isSpecialIpu``
    :return: whatever ``isSpecialIpu`` returns for these settings
    """
    fillFile = path.join(getTextPath(), "fill_fr")
    return isSpecialIpu(ipuContent, 1, fillFile)
예제 #6
0
def _readFirstLineEntries(fileName):
    """
    Return the comma-separated entries found on the first line of the file
    ``fileName`` located in the directory given by ``getTextPath()``.

    :raises IndexError: if the file is empty
    """
    with open(path.join(getTextPath(), fileName), "r") as wordFile:
        # NOTE(review): the line terminator is deliberately left untouched
        # to keep the entries identical to what was passed before; the last
        # entry may therefore end with a newline - confirm that
        # countSpecialWords copes with it.
        return wordFile.readlines()[0].split(',')


def analysisInManyDimensions(arrayOfCorpus):
    """
    Gather per-file measures of several corpora into one pandas DataFrame.

    For every corpus, the following per-file values are collected (all the
    corpus methods are called with ``forEachFile=True``) and concatenated
    in corpus order:

    * 'lexical richness': ``getMeanLexicalRichness``
    * 'ratio fill': count of the words of the ``fill_<language>`` list
      divided by the number of words (0 when the file has no word)
    * 'ratio ipu feedback': ``getRatioSpecialIpu("feedback_<language>")``
    * 'mean size not feedback IPU':
      ``getSpecialIpuMeanSize("not feedback_<language>")``
    * 'formality ratio': count of the ``lowFormality_<language>`` words
      divided by the count of the ``formality_<language>`` words (the raw
      low-formality count when the formality count is 0)
    * 'label': the corpus name, repeated ``getNbOfFiles()`` times

    The word lists are read from the first line of the matching files in
    the directory given by ``getTextPath()``.

    :param arrayOfCorpus: iterable of corpus objects
    :return: a DataFrame with one column per measure listed above; the
        columns are empty when ``arrayOfCorpus`` is empty
    """
    lexicalRichness = []
    ratioFill = []
    ratioFeedback = []
    notFeedbackSize = []
    formalityRatio = []
    labels = []

    for corpus in arrayOfCorpus:
        language = corpus.getLanguage()

        # variable: lexical richness
        lexicalRichness.extend(
            corpus.getMeanLexicalRichness(forEachFile=True))

        # variable: share of filler words among all the words of each file
        nbWords = corpus.getNumberOfWords(forEachFile=True)
        nbFill = corpus.countSpecialWords(
            _readFirstLineEntries("fill_" + language), forEachFile=True)
        ratioFill.extend(
            fill / words if words > 0 else 0
            for fill, words in zip(nbFill, nbWords))

        # variable: ratio of feedback IPUs
        ratioFeedback.extend(
            corpus.getRatioSpecialIpu("feedback_" + language,
                                      forEachFile=True))

        # variable: mean size of the IPUs that are not feedback
        notFeedbackSize.extend(
            corpus.getSpecialIpuMeanSize("not feedback_" + language,
                                         forEachFile=True))

        # variable: low-formality words relative to formality words
        formality = corpus.countSpecialWords(
            _readFirstLineEntries("formality_" + language),
            forEachFile=True)
        lowFormality = corpus.countSpecialWords(
            _readFirstLineEntries("lowFormality_" + language),
            forEachFile=True)
        # Without any formality word the raw low-formality count is kept,
        # which avoids a division by zero.
        formalityRatio.extend(
            low / high if high > 0 else low
            for low, high in zip(lowFormality, formality))

        # label: one entry per file of the corpus
        labels.extend([corpus.getName()] * corpus.getNbOfFiles())

    dataFrame = pd.DataFrame({
        'lexical richness': pd.Series(lexicalRichness),
        'ratio fill': pd.Series(ratioFill),
        'ratio ipu feedback': pd.Series(ratioFeedback),
        'mean size not feedback IPU': pd.Series(notFeedbackSize),
        'formality ratio': pd.Series(formalityRatio),
        'label': pd.Series(labels)
    })
    return dataFrame