def distFrequency(self):
    """
    Generate a frequency distribution of the words in this file,
    ignoring the banned words listed in the "bannedWords" text file.

    :return: an nltk FreqDist summed over every line of the file
    """
    with open(path.join(getTextPath(), "bannedWords"), "r") as file:
        # The banned words sit on the first line, separated by spaces.
        notInterrestingWords = file.readline().strip().split(' ')
    if self.__distFreq is None:
        lines = self.getLines()
        arrayDistFrequency = []
        for line in lines:
            # Lower-case every word of the IPU and drop the banned ones
            # (filtering, rather than remove(), drops every occurrence).
            words = [word.lower() for word in self.readIpu(line)
                     if word.lower() not in notInterrestingWords]
            arrayDistFrequency.append(FreqDist(words))
        # FreqDist supports "+=", so the per-line distributions can be
        # summed into a single distribution for the whole file.
        total = FreqDist()
        for distFrequency in arrayDistFrequency:
            total += distFrequency
        self.__distFreq = total
    return self.__distFreq
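# A hedged sketch of the summation step above, with invented toy words.
# It only relies on the fact that nltk's FreqDist behaves like
# collections.Counter, so per-line distributions really do add up:
#
#     >>> from nltk import FreqDist
#     >>> total = FreqDist(["oui", "oui"]) + FreqDist(["non"])
#     >>> total["oui"], total["non"]
#     (2, 1)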
def initProperties(self):
    with open(path.join(getTextPath(), "bannedWords"), "r") as file:
        notInterrestingWords = file.readline().strip().split(' ')
    lines = self.getLines()
    # Duration = end time of the last IPU minus start time of the first
    # one (fields 2 and 1 of a line once split on the corpus delimiter).
    self.__duration = math.floor(
        float(lines[-1].split(self.__corpus.getDelimiter())[2])
        - float(lines[0].split(self.__corpus.getDelimiter())[1]))
    uniqueWords = set()
    for line in lines:
        self.__nbOfLines += 1
        words = self.readIpu(line)
        for word in words:
            if word not in notInterrestingWords:
                # Compound words are joined with '_'; count each part.
                self.__numberOfWords += len(word.split('_'))
                uniqueWords.add(word)
    self.__numberUniqueWords = len(uniqueWords)
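# A worked example of the duration formula above, assuming (hypothetically)
# tab-delimited lines of the form "speaker\tstart\tend\twords"; the real
# field layout depends on the corpus delimiter and annotation format:
#
#     >>> import math
#     >>> lines = ["spk1\t0.00\t1.20\tbonjour", "spk2\t7.85\t9.30\tau revoir"]
#     >>> math.floor(float(lines[-1].split("\t")[2]) - float(lines[0].split("\t")[1]))
#     9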
def isFeedBackIpu_en(ipuContent):
    return isSpecialIpu(ipuContent, 0.5,
                        path.join(getTextPath(), "feedback_en"),
                        path.join(getTextPath(), "neutral_en"), 3)
def isntFeedBackIpu_fr(ipuContent):
    return not isSpecialIpu(ipuContent, 0.5,
                            path.join(getTextPath(), "feedback_fr"),
                            path.join(getTextPath(), "neutral_fr"), 3)
def isFill(ipuContent):
    return isSpecialIpu(ipuContent, 1, path.join(getTextPath(), "fill_fr"))
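# The three wrappers above all delegate to isSpecialIpu, which scores an
# IPU against word lists stored under getTextPath(). A hedged usage sketch
# (the IPU strings are invented; actual results depend on the word-list
# files shipped with the project):
#
#     isFill("euh")                       # likely True for a French filler
#     isFeedBackIpu_en("yeah okay")       # True if scored as feedback
#     isntFeedBackIpu_fr("je pense que")  # True when NOT scored as feedback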
def analysisInManyDimensions(arrayOfCorpus):
    data = []
    # Collect, for each corpus, the per-file features we want.
    for corpus in arrayOfCorpus:
        corpusData = []

        # Feature: mean lexical richness.
        corpusData.append(corpus.getMeanLexicalRichness(forEachFile=True))

        # Feature: ratio of filler words to total words.
        nbWords = corpus.getNumberOfWords(forEachFile=True)
        with open(path.join(getTextPath(),
                            "fill_" + corpus.getLanguage()), "r") as file:
            fillWords = file.readline().strip().split(',')
        nbFill = corpus.countSpecialWords(fillWords, forEachFile=True)
        temp = []
        for i in range(len(nbFill)):
            if nbWords[i] > 0:
                temp.append(nbFill[i] / nbWords[i])
            else:
                temp.append(0)
        corpusData.append(temp)

        # Feature: ratio of feedback IPUs.
        corpusData.append(corpus.getRatioSpecialIpu(
            "feedback_" + corpus.getLanguage(), forEachFile=True))

        # Feature: mean size of the non-feedback IPUs.
        corpusData.append(corpus.getSpecialIpuMeanSize(
            "not feedback_" + corpus.getLanguage(), forEachFile=True))

        # Feature: ratio of low-formality markers to formality markers.
        with open(path.join(getTextPath(),
                            "formality_" + corpus.getLanguage()), "r") as file:
            formalityWords = file.readline().strip().split(',')
        formality = corpus.countSpecialWords(formalityWords, forEachFile=True)
        with open(path.join(getTextPath(),
                            "lowFormality_" + corpus.getLanguage()), "r") as file:
            lowFormalityWords = file.readline().strip().split(',')
        lowFormality = corpus.countSpecialWords(lowFormalityWords,
                                                forEachFile=True)
        temp = []
        for i in range(len(formality)):
            if formality[i] > 0:
                temp.append(lowFormality[i] / formality[i])
            else:
                # No formality marker at all: fall back to the raw count.
                temp.append(lowFormality[i])
        corpusData.append(temp)

        # Label: the corpus name, repeated once per file.
        corpusData.append([corpus.getName()] * corpus.getNbOfFiles())

        data.append(corpusData)

    # Flatten the per-corpus lists column by column so that each feature
    # becomes one pandas Series spanning every file of every corpus.
    tempData = []
    for i in range(len(data[0])):
        temp = []
        for corpusData in data:
            temp.extend(corpusData[i])
        tempData.append(pd.Series(temp))
    data = tempData

    dataFrame = pd.DataFrame({
        'lexical richness': data[0],
        'ratio fill': data[1],
        'ratio ipu feedback': data[2],
        'mean size not feedback IPU': data[3],
        'formality ratio': data[4],
        'label': data[5]
    })
    return dataFrame
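# A hedged usage sketch of analysisInManyDimensions. It assumes Corpus
# objects are built elsewhere (the loadCorpus helper and the corpus names
# below are hypothetical); only the pandas calls are standard API.
if __name__ == "__main__":
    corpora = [loadCorpus(name) for name in ("corpusA", "corpusB")]
    dataFrame = analysisInManyDimensions(corpora)
    # One row per file, one column per feature, plus the corpus label;
    # averaging by label gives a quick per-corpus comparison.
    print(dataFrame.groupby('label').mean(numeric_only=True))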