def applyEvaluator(toBeEvalPath, goldStandPath, resultDfPath=None, verbose=False):
    '''Evaluate a normalized ("ororazed") output against a gold standard.

    :param toBeEvalPath: path to the tsv holding the system output to evaluate
    :param goldStandPath: path to the tsv holding the gold standard
    :param resultDfPath: optional path where the per-comment evaluation series
        and a "<path>.results" summary file are dumped
    :param verbose: if truthy, print ratio / exact positives / total to stdout
    :return: the gold standard series, overwritten with per-comment evaluations
    '''
    positiveEvalCounter = 0
    # open the ororazed test dataframe from the path
    toBeEvalDf = myUtils.getDataFrameFromArgs(toBeEvalPath, header=False)[0]
    # open the goldStandard test dataframe from the path
    goldEvalDf = myUtils.getDataFrameFromArgs(goldStandPath, header=False)[0]
    # browse every (index, gold standard) pair and score the matching output
    # (.items() replaces Series.iteritems(), which was removed in pandas 2.0)
    for index, goldStandard in goldEvalDf.items():
        toBeEval = toBeEvalDf[index]
        # evaluate whether the normalized output corresponds to the gold standard
        positiveEvalCounter, evaluation = normalizationEvaluator(
            toBeEval, goldStandard, positiveEvalCounter)
        # save in dataframe: the gold standard df is now a result df
        goldEvalDf[index] = evaluation
    total = len(goldEvalDf)
    # guard against an empty gold standard (the original raised ZeroDivisionError)
    ratio = float(positiveEvalCounter) / float(total) if total else 0.0
    # dump the result df and a human-readable summary next to it
    if resultDfPath is not None:
        goldEvalDf.to_csv(resultDfPath, sep='\t', index=False)
        with open(u'{0}.results'.format(resultDfPath), u'w', encoding=u'utf8') as resultFile:
            resultFile.write(
                u'NORMALIZATION RESULTS\nratio\texact positives\ttotal comments\n{0}\t{1}\t{2}'.format(
                    ratio, positiveEvalCounter, total))
    # print if needed
    if verbose:
        print(u'ratio\texact positives\ttotal comments')
        print(ratio, positiveEvalCounter, total)
    return goldEvalDf
def applyEvaluator(toBeEvalPath, goldStandPath, resultDfPath=None, verbose=False, spacyModel=None):
    '''Evaluate a normalized output against a re-tokenized gold standard.

    Scores each comment for an exact match and for a "pas au point pres"
    match, dumps the mismatching pairs into ./008result/ "trash" files for
    manual inspection, and optionally dumps the result dataframe.

    :param spacyModel: spacy model forwarded to the tokenizer
    :return: (resultDf, (exact ratio, exact positives, ppp ratio,
        ppp positives, total comments))
    '''
    positiveEvalCounter = 0
    positiveEvalCounterPPP = 0
    # open the ororazed test dataframe from the path
    toBeEvalDf = myUtils.getDataFrameFromArgs(toBeEvalPath, header=False)[0]
    # open the goldStandard test dataframe from the path
    goldEvalDf = myUtils.getDataFrameFromArgs(goldStandPath, header=False)[0]
    # make a result df: one row per gold comment, one column per metric
    resultDf = pd.DataFrame(0, index=np.arange(len(goldEvalDf)),
                            columns=['exactMatch', 'pasAuPointPres'])
    # browse (.items() replaces Series.iteritems(), removed in pandas 2.0)
    for index, goldStandard in goldEvalDf.items():
        toBeEval = toBeEvalDf[index]
        # re-tokenize the gold standard so both sides share the same tokenization
        goldStandard = u' '.join(
            myUtils.multTokenizer(goldStandard, whatTokenizer=0, spacyModel=spacyModel))
        # exact-match evaluation
        positiveEvalCounter, evaluation = normalizationEvaluator(
            toBeEval, goldStandard, positiveEvalCounter)
        # "pas au point pres" evaluation -- presumably a near-match metric; TODO confirm
        positiveEvalCounterPPP, evaluationPpp = normalizationPppEvaluator(
            toBeEval, goldStandard, positiveEvalCounterPPP)
        # .loc replaces the chained assignment resultDf['col'][index] = ...,
        # which can silently write to a temporary copy on some pandas versions
        resultDf.loc[index, 'exactMatch'] = evaluation
        resultDf.loc[index, 'pasAuPointPres'] = evaluationPpp
    # dump result df and the human-readable summary
    if resultDfPath is not None:
        resultDf.to_csv(resultDfPath, sep='\t', index=False)
        with open(u'{0}.results'.format(resultDfPath), u'w', encoding=u'utf8') as resultFile:
            resultFile.write(
                u'NORMALIZATION RESULTS\nratio\texact positives\ttotal comments\n{0}\t{1}\t{2}'.format(
                    (float(positiveEvalCounter) / float(len(goldEvalDf))),
                    positiveEvalCounter, len(goldEvalDf)))
    # dump the mismatching pairs ("trash") in temp files for manual inspection
    mismatchMask = resultDf['exactMatch'].values == 0
    trashSeries1 = toBeEvalDf.iloc[mismatchMask]
    trashSeries2 = goldEvalDf.iloc[mismatchMask]
    trashDf = pd.DataFrame(dict(SystemOutput=trashSeries1, GoldStandard=trashSeries2))
    trashPath = u'./008result/trash.tsv'
    trashGrepSystOutputPath = u'./008result/trashSystOutput.tsv'
    trashGrepSystGoldStandPath = u'./008result/trashGoldStand.tsv'
    # if the trash file already exists, append to it (dropping duplicated rows)
    if myUtils.theFileExists(trashPath):
        tempDf = myUtils.getDataFrameFromArgs(trashPath, header=True)
        trashDf = pd.concat([tempDf, trashDf]).drop_duplicates()
    # dump excel-readable file
    trashDf.to_csv(trashPath, sep='\t', index=False, header=True)
    # dump grepable files
    trashDf[u'SystemOutput'].to_csv(trashGrepSystOutputPath, sep='\t', index=False, header=False)
    trashDf[u'GoldStandard'].to_csv(trashGrepSystGoldStandPath, sep='\t', index=False, header=False)
    # print if needed
    if verbose:
        print(u'EXACT MATCH :')
        print(u'ratio\texact positives\ttotal comments')
        print(float(positiveEvalCounter) / float(len(resultDf)), positiveEvalCounter, len(resultDf))
        print(u'PAS AU POINT PRES :')
        print(u'ratio\texact positives\ttotal comments')
        print(float(positiveEvalCounterPPP) / float(len(resultDf)), positiveEvalCounterPPP, len(resultDf))
    return resultDf, (
        float(positiveEvalCounter) / float(len(resultDf)), positiveEvalCounter,
        float(positiveEvalCounterPPP) / float(len(resultDf)), positiveEvalCounterPPP,
        len(resultDf))
def applyLearnedDictPlusHumanDict(
        testOrigPath, normOutPath=None,
        learnedDictPath=u'./005learnedDict/ororaAbbreviationDict.json',
        humanDictPath=u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json'):
    '''Apply both abbreviation-normalization dicts to every comment of a tsv file.

    The human-made dict is applied before the learned dict so that its
    entries take priority, as the original in-code comments intended.

    :param testOrigPath: path to the single-column tsv of comments to normalize
    :param normOutPath: optional path where the normalized series is dumped
    :return: the pandas series with normalized comments
    '''
    # open the dicts
    learnedDict = myUtils.openJsonFileAsDict(learnedDictPath)
    humanDict = myUtils.openJsonFileAsDict(humanDictPath)
    # open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    # (.items() replaces Series.iteritems(), removed in pandas 2.0)
    for index, testComment in testOrigDf.items():
        # use the human dict FIRST (priority to the human-made dict)
        # bug fix: the original passed learnedDict here and humanDict second,
        # contradicting its own comments about priority
        normOutput = ororaZeAbbreviations(testComment, humanDict, listTheVariations=False)
        # then apply the learned dict
        normOutput = ororaZeAbbreviations(normOutput, learnedDict, listTheVariations=False)
        # save into the pandas series
        testOrigDf[index] = normOutput
    # dump normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    return testOrigDf
def applyOroraze(transformedFilePath, ororazedPath=None, advanced=False):
    '''Apply the "ororaZe" normalization to the comments of a tsv file.

    Handles both a single-column file (treated as a pandas series) and a
    multi-column file (where only the u'CommentIn' column is ororazed, then
    all columns are cleaned of tabs and multiple spaces).

    :param transformedFilePath: path to the transformed tsv file
    :param ororazedPath: optional path where the result is dumped
    :param advanced: forwarded to myUtils.ororaZe
    :return: the ororazed series or dataframe
    '''
    # open the transformed test dataframe from the path
    transformedFileDf = myUtils.getDataFrameFromArgs(transformedFilePath)
    # treat differently a single-column (series-like) and a multi-column file
    if len(list(transformedFileDf)) == 1:
        # single column: work on the series directly
        transformedFileDf = transformedFileDf[0]
        # (.items() replaces Series.iteritems(), removed in pandas 2.0)
        for index, transformedComment in transformedFileDf.items():
            # ororaze and save back into the series
            transformedFileDf[index] = myUtils.ororaZe(transformedComment, advanced=advanced)
    else:
        # dataframe: ororaze the original comments column
        ororazePartial = functools.partial(myUtils.ororaZe, advanced=advanced)
        transformedFileDf[u'CommentIn'] = transformedFileDf[u'CommentIn'].apply(ororazePartial)
        # get rid of tabs in the column content
        transformedFileDf = transformedFileDf.applymap(myUtils.replaceTabs)
        # collapse multiple spaces in both the original and the gold standard
        transformedFileDf = transformedFileDf.applymap(myUtils.eliminateMultipleSpaces)
    # dump normalized output
    if ororazedPath is not None:
        transformedFileDf.to_csv(ororazedPath, sep='\t', index=False)
    return transformedFileDf
def makeSetsForCrossVal(origDf=u'./001ororazed/ororized.tsv', nbSegmentations=10,
                        randomize=True, outputFolderPath=None):
    '''Segment a dataframe into (roughly) equal subsets for cross-validation.

    :param origDf: dataframe, or path to the tsv dataframe, to segment
    :param nbSegmentations: number of folds; a float ratio below 1
        (e.g. 0.25 for 25%) is interpreted as 1/ratio folds
    :param randomize: shuffle the dataframe before segmenting
    :param outputFolderPath: optional folder where each fold's CommentIn /
        CommentOut columns are dumped as tsv files (the folder's existing
        tsv files are removed first)
    :return: list of dataframe segments
    '''
    listOfDfs = []
    # bug fix: the original unconditionally read outputFolderPath[-1], which
    # raised TypeError when the (default) None was used
    if outputFolderPath is not None and not outputFolderPath.endswith(u'/'):
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the data frame
    origDf = myUtils.getDataFrameFromArgs(origDf)
    # if nbSegmentations is a ratio (e.g., 0.25 for 25%), turn it into an
    # integer number of segmentations
    if type(nbSegmentations) is float and str(nbSegmentations)[0] == '0':
        nbSegmentations = int(1.0 / nbSegmentations)
    nbSegmentations = int(nbSegmentations)
    # get the size of each segment
    segmSize = float(len(origDf)) / float(nbSegmentations)
    # shuffle the dataframe randomly
    if randomize:
        origDf = origDf.sample(frac=1.0, replace=False)
    # populate the list with the first n-1 segments
    for n in range(1, nbSegmentations):
        listOfDfs.append(origDf.iloc[int(segmSize * (n - 1)):int(segmSize * n)])
    # append the last segment, containing the remaining rows of the df; its
    # size may slightly differ from the uniform size of the other segments
    # (bug fix: the original reused the loop variable n here, a NameError
    # when nbSegmentations == 1)
    listOfDfs.append(origDf.iloc[int(segmSize * (nbSegmentations - 1)):])
    # dump the dataframes
    if outputFolderPath is not None:
        myUtils.emptyTheFolder(outputFolderPath, fileExtensionOrListOfExtensions=u'tsv')
        for n, df in enumerate(listOfDfs):
            df[u'CommentIn'].to_csv(
                u'{0}crossValidationOrig{1}.tsv'.format(outputFolderPath, n), sep='\t', index=False)
            df[u'CommentOut'].to_csv(
                u'{0}crossValidationGS{1}.tsv'.format(outputFolderPath, n), sep='\t', index=False)
    return listOfDfs
def applyNormalisation(testOrigPath, normOutPath=None, normalization=None, *args):
    '''Apply a normalization to every comment of a single-column tsv file.

    :param testOrigPath: path to the tsv of comments to normalize
    :param normOutPath: optional path where the normalized series is dumped
    :param normalization: one of
        - None: identity, comments are kept untouched
        - dict (or a path to a json dict): used as an abbreviation dict
        - callable: a normalization function taking (string, dejavuDict,
          *args) and returning (string, dejavuDict)
    :param args: extra arguments forwarded to a callable normalization
    :return: the pandas series with normalized comments
    '''
    # if we were given a path to the place where the dict is, load it
    if type(normalization) is str:
        normalization = myUtils.openJsonFileAsDict(normalization)
    # start an empty dejavu dict (memoization shared across comments)
    dejavuDict = {}
    c = 0
    # open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    # (.items() replaces Series.iteritems(), removed in pandas 2.0)
    for index, testComment in testOrigDf.items():
        if normalization is None:
            # no normalization: keep the comment untouched
            normOutput = testComment
        elif type(normalization) is dict:
            # use the dict as a normalization
            normOutput, c = ororaZeAbbreviations(
                testComment, normalization, listTheVariations=True, c=c)
        else:
            # detect the french feminine accord and fossilize the word by
            # modifying its structure to something unchanged by normalization
            # (bug fix: the original referenced the undefined name
            # "originalComment"; the loop variable is testComment)
            normOutput = frenchFemininAccordsCodification(testComment, isInput=True)
            # apply the spell corrector or other normalization function
            # (bug fix: the original called the undefined name
            # "normalizationFunction"; in this branch "normalization" is the callable)
            normOutput, dejavuDict = normalization(normOutput.lower(), dejavuDict, *args)
            # reverse the feminine-accord code back into its original form
            normOutput = frenchFemininAccordsCodification(normOutput, isInput=False)
        # save into the pandas series
        testOrigDf[index] = normOutput
    # dump normalized output
    # (a leftover debug print "22222 ..." was removed here; it also divided
    # by zero on an empty input file)
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    return testOrigDf
def _dumpCommentColumns(df, outputFolderPath, baseName):
    '''Dump df's CommentIn / CommentOut columns as <base>Orig.tsv / <base>GS.tsv.'''
    df[u'CommentIn'].to_csv(
        u'{0}{1}Orig.tsv'.format(outputFolderPath, baseName), sep='\t', index=False)
    df[u'CommentOut'].to_csv(
        u'{0}{1}GS.tsv'.format(outputFolderPath, baseName), sep='\t', index=False)


def makeTrainTestValidSetsFromTsv(origDf=u'./001corpus/inputOutputGs.tsv',
                                  ratioSizes=(0.2, 0.8),
                                  outputFolderPath=u'./002sets/'):
    '''Given the dataframe with the whole original input, return 2 or 3
    distinct dataframes containing randomly selected elements corresponding
    to the given ratio sizes.

    The ratioSizes order must be: TRAIN - TEST - VALIDATION.

    :param origDf: dataframe, or path to the tsv dataframe, to split
    :param ratioSizes: 2 or 3 ratios of the total size (default is a tuple
        rather than the original mutable list default)
    :param outputFolderPath: optional folder where each set's CommentIn /
        CommentOut columns are dumped
    :return: (trainDf, testDf) or (trainDf, testDf, validDf)
    :raises IndexError: when the number of ratios is neither 2 nor 3
    '''
    if outputFolderPath is not None and not outputFolderPath.endswith(u'/'):
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the data frame
    origDf = myUtils.getDataFrameFromArgs(origDf)
    # get rid of tabs in the column content
    origDf = origDf.applymap(myUtils.replaceTabs)
    # get the actual sizes from the ratios; we avoid the "frac" argument of
    # the pd.sample function and always pass absolute counts
    nSizes = [int(r * len(origDf)) for r in ratioSizes]
    # train set
    trainDf = origDf.sample(n=nSizes[0], replace=False)
    remainingDf = origDf.iloc[~origDf.index.isin(trainDf.index)]
    # test set
    testDf = remainingDf.sample(n=nSizes[1], replace=False)
    # determine if we must return a train-test or a train-test-validation split
    if len(nSizes) == 2:
        if outputFolderPath is not None:
            _dumpCommentColumns(trainDf, outputFolderPath, u'train')
            _dumpCommentColumns(testDf, outputFolderPath, u'test')
        return trainDf, testDf
    elif len(nSizes) == 3:
        remainingDf = remainingDf.iloc[~remainingDf.index.isin(testDf.index)]
        # bug fix: the original passed the absolute count nSizes[2] as
        # "frac=", which expects a fraction in [0, 1]
        validDf = remainingDf.sample(n=nSizes[2], replace=False)
        if outputFolderPath is not None:
            _dumpCommentColumns(trainDf, outputFolderPath, u'train')
            _dumpCommentColumns(testDf, outputFolderPath, u'test')
            _dumpCommentColumns(validDf, outputFolderPath, u'validation')
        return trainDf, testDf, validDf
    raise IndexError(
        'The number of ratio sizes is neither 2 nor 3. We require 2 ratio sizes to return a train and test set and 3 to return a train, test and validation sets.'
    )
def makeAlignLists(pathToTrainOrigTsv, pathToTrainGoldTsv, alignMostSimilar=False,
                   origAlignPath=u'./003alignedTrainSet/alignedOrigLists.tsv',
                   goldAlignPath=u'./003alignedTrainSet/alignedGoldLists.tsv'):
    '''Align each original training comment with its gold-standard counterpart.

    Dumps the aligned token lists raw, as per-file tab-separated files, and
    as one interleaved "orig:/gold:" file, then returns both lists of
    aligned token lists.
    '''
    alignedOrigs = []
    alignedGolds = []
    # load both single-column train files as pandas series
    origSeries = myUtils.getDataFrameFromArgs(pathToTrainOrigTsv, header=False)[0]
    goldSeries = myUtils.getDataFrameFromArgs(pathToTrainGoldTsv, header=False)[0]
    # pair every original comment with the gold comment at the same position
    for rowIdx, origComment in enumerate(origSeries):
        # align the two comments token by token
        origTokens, goldTokens = myUtils.align2SameLangStrings(
            origComment, goldSeries[rowIdx], windowSize=4,
            alignMostSimilar=alignMostSimilar)
        alignedOrigs.append(origTokens)
        alignedGolds.append(goldTokens)
    # raw dump of the aligned token lists
    myUtils.dumpRawLines(alignedOrigs, origAlignPath, addNewline=True, rewrite=True)
    myUtils.dumpRawLines(alignedGolds, goldAlignPath, addNewline=True, rewrite=True)

    def writeTsv(rows, filePath):
        # one tab-separated line per aligned list, no trailing newline
        outFile = myUtils.createEmptyFile(filePath)
        outFile.write(u'\n'.join(u'\t'.join(row) for row in rows))
        outFile.close()

    def writeInterleavedTsv(origRows, goldRows, filePath):
        # "orig:" and "gold:" lines for every aligned pair, no trailing newline
        # (aligned pairs are assumed to be the same length — TODO confirm)
        outFile = myUtils.createEmptyFile(filePath)
        pairLines = [
            u'orig:\t{0}\ngold:\t{1}'.format(u'\t'.join(origRow), u'\t'.join(goldRow))
            for origRow, goldRow in zip(origRows, goldRows)]
        outFile.write(u'\n'.join(pairLines))
        outFile.close()

    # tab-separated dumps
    writeTsv(alignedOrigs, origAlignPath.replace(u'Lists.tsv', u'.tsv'))
    writeTsv(alignedGolds, goldAlignPath.replace(u'Lists.tsv', u'.tsv'))
    writeInterleavedTsv(alignedOrigs, alignedGolds, u'./003alignedTrainSet/alignment.tsv')
    return alignedOrigs, alignedGolds
def makeAlignLists(pathToTrainOrigTsv, pathToTrainGoldTsv,
                   origAlignPath=u'./003alignedTrainSet/alignedOrigLists.tsv',
                   goldAlignPath=u'./003alignedTrainSet/alignedGoldLists.tsv',
                   alignMostSimilar=False, dumpBoth=False, spacyModel=None):
    '''Align each original training comment with its gold-standard counterpart.

    Dumps the aligned token lists raw, as grepable tab-separated files and,
    optionally, as one interleaved "orig:/gold:" file; returns both lists.

    :param spacyModel: spacy model forwarded to the tokenizer
    :param dumpBoth: if truthy, also dump the interleaved alignment file
    '''
    trainOrigAlignedList = []
    trainGoldAlignedList = []
    # open the train series from the paths
    trainOrigDf = myUtils.getDataFrameFromArgs(pathToTrainOrigTsv, header=False)[0]
    trainGoldDf = myUtils.getDataFrameFromArgs(pathToTrainGoldTsv, header=False)[0]
    # choose the tokenizer once (hoisted: the original rebuilt this partial
    # on every loop iteration although it is loop-invariant)
    tokenizingFunct = partial(myUtils.multTokenizer, whatTokenizer=0, spacyModel=spacyModel)
    # get the gold standard data to which compare the training data
    for index, origComment in enumerate(trainOrigDf):
        goldComment = trainGoldDf[index]
        # align the two comments
        alignedListOrig, alignedListGold = myUtils.align2SameLangStrings(
            origComment, goldComment, windowSize=3,
            alignMostSimilar=alignMostSimilar, tokenizingFunct=tokenizingFunct)
        # add to the lists of aligned elements
        trainOrigAlignedList.append(alignedListOrig)
        trainGoldAlignedList.append(alignedListGold)
    # dump the raw lists
    myUtils.dumpRawLines(trainOrigAlignedList, origAlignPath, addNewline=True, rewrite=True)
    myUtils.dumpRawLines(trainGoldAlignedList, goldAlignPath, addNewline=True, rewrite=True)

    # tab-separated dump: one line per aligned list, no trailing newline
    def dumpStringTsv(alignedList, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, aligned in enumerate(alignedList):
            alignString = u''
            for ind, elem in enumerate(aligned):
                alignString = u'{0}\t{1}'.format(alignString, elem) if ind != 0 else elem
            if index != (len(alignedList) - 1):
                openFile.write(u'{0}\n'.format(alignString))
            else:
                openFile.write(alignString)
        openFile.close()

    # interleaved dump: "orig:" and "gold:" lines per aligned pair
    def dumpBothTsv(origAlign, goldAlign, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, origAligned in enumerate(origAlign):
            goldAligned = goldAlign[index]
            origAlignString = u''
            goldAlignString = u''
            for ind, origElem in enumerate(origAligned):
                goldElem = goldAligned[ind]
                origAlignString = u'{0}\t{1}'.format(
                    origAlignString, origElem) if ind != 0 else origElem
                goldAlignString = u'{0}\t{1}'.format(
                    goldAlignString, goldElem) if ind != 0 else goldElem
            if index != (len(origAlign) - 1):
                openFile.write(u'orig:\t{0}\ngold:\t{1}\n'.format(
                    origAlignString, goldAlignString))
            else:
                openFile.write(u'orig:\t{0}\ngold:\t{1}'.format(
                    origAlignString, goldAlignString))
        openFile.close()

    # get the right paths to dump the grepable alignments
    origPathList = (origAlignPath.replace(u'Lists', u'')).split(u'/')
    # bug fix: the original derived this from origAlignPath too, so the gold
    # grepable file was written under the orig file's name
    goldPathList = (goldAlignPath.replace(u'Lists', u'')).split(u'/')
    # dump the grepable files
    dumpStringTsv(
        trainOrigAlignedList,
        u'{0}/grepable/{1}'.format(u'/'.join(origPathList[:-1]), origPathList[-1]))
    dumpStringTsv(
        trainGoldAlignedList,
        u'{0}/grepable/{1}'.format(u'/'.join(goldPathList[:-1]), goldPathList[-1]))
    if dumpBoth:
        dumpBothTsv(trainOrigAlignedList, trainGoldAlignedList,
                    u'./003alignedTrainSet/alignment.tsv')
    return trainOrigAlignedList, trainGoldAlignedList