Example #1
def applyEvaluator(toBeEvalPath, goldStandPath, resultDfPath=None, verbose=False):
	''' compare each normalized comment with its gold standard and return the evaluation dataframe '''
	positiveEvalCounter = 0
	#open the ororazed test dataframe from the path
	toBeEvalDf = myUtils.getDataFrameFromArgs(toBeEvalPath, header=False)[0]
	#open the goldStandard test dataframe from the path
	goldEvalDf = myUtils.getDataFrameFromArgs(goldStandPath, header=False)[0]
	#browse
	for index, goldStandard in goldEvalDf.items():
		toBeEval = toBeEvalDf[index]
		#evaluate whether the normalized output corresponds to the gold standard
		positiveEvalCounter, evaluation = normalizationEvaluator(toBeEval, goldStandard, positiveEvalCounter)
		#save in dataframe, the gold standard df is now a result df
		goldEvalDf[index] = evaluation
	#dump result df
	if resultDfPath is not None:
		goldEvalDf.to_csv(resultDfPath, sep='\t', index=False)
		#dump result
		with open(u'{0}.results'.format(resultDfPath), u'w', encoding=u'utf8') as resultFile:
			resultFile.write( u'NORMALIZATION RESULTS\nratio\texact positives\ttotal comments\n{0}\t{1}\t{2}'.format( (float(positiveEvalCounter)/float(len(goldEvalDf))), positiveEvalCounter, len(goldEvalDf) ) )
	#print if needed
	if verbose:
		print(u'ratio\texact positives\ttotal comments')
		print(float(positiveEvalCounter)/float(len(goldEvalDf)), positiveEvalCounter, len(goldEvalDf))
	return goldEvalDf
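A minimal usage sketch (the file paths are hypothetical; they follow the numbered-folder convention of the other examples):

resultDf = applyEvaluator(u'./004norm/normalizedTest.tsv',
                          u'./002sets/testGS.tsv',
                          resultDfPath=u'./008result/evaluation.tsv',
                          verbose=True)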
Example #2
def applyEvaluator(toBeEvalPath, goldStandPath, resultDfPath=None, verbose=False, spacyModel=None):
	''' evaluate the normalized output against the gold standard, on exact match and on the "pas au point pres" criterion '''
	positiveEvalCounter = 0
	positiveEvalCounterPPP = 0
	#open the ororazed test dataframe from the path
	toBeEvalDf = myUtils.getDataFrameFromArgs(toBeEvalPath, header=False)[0]
	#open the goldStandard test dataframe from the path
	goldEvalDf = myUtils.getDataFrameFromArgs(goldStandPath, header=False)[0]
	#make a result df
	resultDf = pd.DataFrame(0, index=np.arange(len(goldEvalDf)), columns=['exactMatch', 'pasAuPointPres'])
	#browse
	for index, goldStandard in goldEvalDf.items():
		toBeEval = toBeEvalDf[index]
		#choose the tokenizer
		goldStandard = u' '.join(myUtils.multTokenizer(goldStandard, whatTokenizer=0, spacyModel=spacyModel))
		#evaluate whether the normalized output corresponds to the gold standard
		positiveEvalCounter, evaluation = normalizationEvaluator(toBeEval, goldStandard, positiveEvalCounter)
		positiveEvalCounterPPP, evaluationPpp = normalizationPppEvaluator(toBeEval, goldStandard, positiveEvalCounterPPP)
		#save in the result dataframe (.loc avoids chained assignment)
		resultDf.loc[index, 'exactMatch'] = evaluation
		resultDf.loc[index, 'pasAuPointPres'] = evaluationPpp
	#dump result df
	if resultDfPath is not None:
		resultDf.to_csv(resultDfPath, sep='\t', index=False)
		#dump result
		with open(u'{0}.results'.format(resultDfPath), u'w', encoding=u'utf8') as resultFile:
			resultFile.write( u'NORMALIZATION RESULTS\nratio\texact positives\ttotal comments\n{0}\t{1}\t{2}'.format( (float(positiveEvalCounter)/float(len(goldEvalDf))), positiveEvalCounter, len(goldEvalDf) ) )
		#dump trash results in a temp file
		trashSeries1 = toBeEvalDf.iloc[resultDf['exactMatch'].values == 0]
		trashSeries2 = goldEvalDf.iloc[resultDf['exactMatch'].values == 0]
		trashDf = pd.DataFrame(dict(SystemOutput=trashSeries1, GoldStandard=trashSeries2))
		trashPath = u'./008result/trash.tsv'
		trashGrepSystOutputPath = u'./008result/trashSystOutput.tsv'
		trashGrepSystGoldStandPath = u'./008result/trashGoldStand.tsv'
		#if it already exists append to it
		if myUtils.theFileExists(trashPath):
			#prepare the excel-readable dump
			tempDf = myUtils.getDataFrameFromArgs(trashPath, header=True)
			trashDf = pd.concat([tempDf, trashDf]).drop_duplicates()
		#dump excel-readable file
		trashDf.to_csv(trashPath, sep='\t', index=False, header=True)
		#dump grepable files
		trashDf[u'SystemOutput'].to_csv(trashGrepSystOutputPath, sep='\t', index=False, header=False)
		trashDf[u'GoldStandard'].to_csv(trashGrepSystGoldStandPath, sep='\t', index=False, header=False)
	#print if needed
	if verbose:
		print(u'EXACT MATCH :')
		print(u'ratio\texact positives\ttotal comments')
		print(float(positiveEvalCounter)/float(len(resultDf)), positiveEvalCounter, len(resultDf))		
		print(u'PAS AU POINT PRES :')
		print(u'ratio\texact positives\ttotal comments')
		print(float(positiveEvalCounterPPP)/float(len(resultDf)), positiveEvalCounterPPP, len(resultDf))
	return resultDf, ( float(positiveEvalCounter)/float(len(resultDf)), positiveEvalCounter, float(positiveEvalCounterPPP)/float(len(resultDf)), positiveEvalCounterPPP, len(resultDf) )
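normalizationEvaluator is defined elsewhere in the module; a plausible minimal sketch of the exact-match variant, assuming (as its use above suggests) that it returns the updated counter together with a 0/1 evaluation flag:

def normalizationEvaluator(toBeEval, goldStandard, positiveEvalCounter):
    #hypothetical sketch: count an exact match after collapsing whitespace
    evaluation = 1 if u' '.join(toBeEval.split()) == u' '.join(goldStandard.split()) else 0
    return positiveEvalCounter + evaluation, evaluation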
Example #3
def applyLearnedDictPlusHumanDict(
    testOrigPath,
    normOutPath=None,
    learnedDictPath=u'./005learnedDict/ororaAbbreviationDict.json',
    humanDictPath=u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json'
):
    ''' apply the normalization dict'''
    #open the dicts
    learnedDict = myUtils.openJsonFileAsDict(learnedDictPath)
    humanDict = myUtils.openJsonFileAsDict(humanDictPath)
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    for index, testComment in testOrigDf.items():
        #use the human dict FIRST (priority to the human-made dicts)
        normOutput = ororaZeAbbreviations(testComment,
                                          humanDict,
                                          listTheVariations=False)
        #then use the learned dict
        normOutput = ororaZeAbbreviations(normOutput,
                                          learnedDict,
                                          listTheVariations=False)
        #save into pandas series
        testOrigDf[index] = normOutput
    #dump normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    return testOrigDf
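A usage sketch relying on the default dictionary paths (the test-set path is hypothetical):

normalizedDf = applyLearnedDictPlusHumanDict(u'./002sets/testOrig.tsv',
                                             normOutPath=u'./004norm/dictNormalized.tsv')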
Example #4
def applyOroraze(transformedFilePath, ororazedPath=None, advanced=False):
    ''' apply the ororaZe normalization to a one-column file (a pandas series of comments) or to a multi-column dataframe '''
    #open the transformed test dataframe from the path
    transformedFileDf = myUtils.getDataFrameFromArgs(transformedFilePath)
    #treat it differently depending on whether it is a dataframe or a pandas series
    if len(list(transformedFileDf)) == 1:  #a single column: treat it as a pandas series
        transformedFileDf = transformedFileDf[0]
        for index, transformedComment in transformedFileDf.items():
            ororOutput = myUtils.ororaZe(transformedComment, advanced=advanced)
            #save to df
            transformedFileDf[index] = ororOutput
    else:  #if it's a dataframe
        #ororaze the original comments
        ororazePartial = functools.partial(myUtils.ororaZe, advanced=advanced)
        transformedFileDf[u'CommentIn'] = transformedFileDf[
            u'CommentIn'].apply(ororazePartial)
        #get rid of tabs in the column content
        transformedFileDf = transformedFileDf.applymap(myUtils.replaceTabs)
        #get rid of multiple spaces in both the original and the gold standard
        transformedFileDf = transformedFileDf.applymap(
            myUtils.eliminateMultipleSpaces)
    #dump normalized output
    if ororazedPath is not None:
        transformedFileDf.to_csv(ororazedPath, sep='\t', index=False)
    return transformedFileDf
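A minimal call, assuming a tab-separated file produced by an earlier transformation step (both paths are hypothetical):

ororazedDf = applyOroraze(u'./000transformed/transformed.tsv',
                          ororazedPath=u'./001ororazed/ororized.tsv',
                          advanced=True)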
Example #5
def makeSetsForCrossVal(origDf=u'./001ororazed/ororized.tsv',
                        nbSegmentations=10,
                        randomize=True,
                        outputFolderPath=None):
    ''' given a dataframe, returns subsets of said dataframe to be used
	for cross-validation '''
    listOfDfs = []
    if outputFolderPath is not None and outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    #get the data frame
    origDf = myUtils.getDataFrameFromArgs(origDf)
    #if the nb of segmentations is a ratio (e.g., 0.25 for 25%), we transform it into an int nb of segmentations
    if isinstance(nbSegmentations, float) and nbSegmentations < 1.0:
        nbSegmentations = int(1.0 / nbSegmentations)
    #get the size of each segment
    segmSize = float(len(origDf)) / float(nbSegmentations)
    #randomly shuffle the dataframe
    if randomize:
        origDf = origDf.sample(frac=1.0, replace=False)
    #populate the list with the segmented dataframes
    nbSegmentations = int(nbSegmentations)
    for n in range(1, nbSegmentations):
        listOfDfs.append(origDf.iloc[int(segmSize * (n - 1)):int(segmSize * n)])
    #append the last segment, containing the remaining elements of the df; its size
    #might vary slightly from the expected uniform size of the other segments
    listOfDfs.append(origDf.iloc[int(segmSize * (nbSegmentations - 1)):])
    #dump the dataframes
    if outputFolderPath is not None:
        myUtils.emptyTheFolder(outputFolderPath,
                               fileExtensionOrListOfExtensions=u'tsv')
        for n, df in enumerate(listOfDfs):
            df[u'CommentIn'].to_csv(u'{0}crossValidationOrig{1}.tsv'.format(
                outputFolderPath, n),
                                    sep='\t',
                                    index=False)
            df[u'CommentOut'].to_csv(u'{0}crossValidationGS{1}.tsv'.format(
                outputFolderPath, n),
                                     sep='\t',
                                     index=False)
    return listOfDfs


#makeTrainTestValidSetsFromTsv(origDf, ratioSizes, outputFolderPath)
# makeSetsForCrossVal(origDf, nbSegmentations, True, outputFolderPath)
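A sketch of how the dumped folds could be consumed, pairing each crossValidationOrig{n}.tsv with its crossValidationGS{n}.tsv (the output folder is a hypothetical example):

listOfDfs = makeSetsForCrossVal(u'./001ororazed/ororized.tsv',
                                nbSegmentations=10,
                                randomize=True,
                                outputFolderPath=u'./002sets/crossVal/')
for n in range(len(listOfDfs)):
    origPath = u'./002sets/crossVal/crossValidationOrig{0}.tsv'.format(n)
    goldPath = u'./002sets/crossVal/crossValidationGS{0}.tsv'.format(n)
    #fold n: normalize origPath, then evaluate the output against goldPath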
Example #6
def applyNormalisation(testOrigPath,
                       normOutPath=None,
                       normalization=None,
                       *args):
    ''' apply the normalization dict'''
    #if we are given a path to the place where the dict is
    if type(normalization) is str:
        normalization = myUtils.openJsonFileAsDict(normalization)
    #start an empty dejavuDict
    dejavuDict = {}
    c = 0
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    for index, testComment in testOrigDf.items():
        if normalization is None:
            normOutput = testComment
        #use the dict as a normalization
        elif type(normalization) is dict:
            normOutput, c = ororaZeAbbreviations(testComment,
                                                 normalization,
                                                 listTheVariations=True,
                                                 c=c)
        else:
            #detect the french feminine accord and fossilize the word by modifying its structure to something unchanged by the normalization function
            normOutput = frenchFemininAccordsCodification(testComment,
                                                          isInput=True)
            #apply the spell corrector or other normalization function passed as argument
            normOutput, dejavuDict = normalization(
                normOutput.lower(), dejavuDict, *args)
            #reverse the code for the feminine accord back into its original form
            normOutput = frenchFemininAccordsCodification(normOutput,
                                                          isInput=False)
        #save into pandas series
        testOrigDf[index] = normOutput
    #dump normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    print(u'variations: {0}\ttotal comments: {1}\tratio: {2}'.format(c, len(testOrigDf), c / len(testOrigDf)))
    return testOrigDf
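The normalization argument selects one of three modes: None (identity), a dict or a path to a JSON dict, or a callable applied to each comment. A sketch of the dict mode, with hypothetical paths:

normalizedDf = applyNormalisation(u'./002sets/testOrig.tsv',
                                  normOutPath=u'./004norm/dictNormalized.tsv',
                                  normalization=u'./005learnedDict/ororaAbbreviationDict.json')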
Example #7
def makeTrainTestValidSetsFromTsv(origDf=u'./001corpus/inputOutputGs.tsv',
                                  ratioSizes=[0.2, 0.8],
                                  outputFolderPath=u'./002sets/'):
    ''' given the dataframe with the whole original input, returns 2 or 3 distinct
	dataframes containing randomly selected elements corresponding to the given
	ratio sizes. The ratioSizes order must be: TRAIN - TEST - VALIDATION'''
    if outputFolderPath is not None:
        outputFolderPath = u'{0}/'.format(
            outputFolderPath
        ) if outputFolderPath[-1] != u'/' else outputFolderPath
    #get the data frame
    origDf = myUtils.getDataFrameFromArgs(origDf)
    #get rid of tabs in the column content
    origDf = origDf.applymap(myUtils.replaceTabs)
    #get the actual sizes from the ratios
    nSizes = [int(r * len(origDf)) for r in ratioSizes]  #we avoid using the "frac" argument of the "pd.sample" function
    #train-test set
    trainDf = origDf.sample(n=nSizes[0], replace=False)  #train set
    remainingDf = origDf.iloc[~origDf.index.isin(trainDf.index)]
    testDf = remainingDf.sample(n=nSizes[1], replace=False)  #test set
    #determine if it must return a train-test set or a train-validation-test set
    if len(nSizes) == 2:
        #dumping
        if outputFolderPath is not None:
            trainDf[u'CommentIn'].to_csv(
                u'{0}trainOrig.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            trainDf[u'CommentOut'].to_csv(
                u'{0}trainGS.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            testDf[u'CommentIn'].to_csv(
                u'{0}testOrig.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            testDf[u'CommentOut'].to_csv(
                u'{0}testGS.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
        return trainDf, testDf
    #train-validation-test set
    elif len(nSizes) == 3:
        remainingDf = remainingDf.iloc[~remainingDf.index.isin(testDf.index)]
        validDf = remainingDf.sample(n=nSizes[2], replace=False)  #validation set
        #dumping
        if outputFolderPath is not None:
            trainDf[u'CommentIn'].to_csv(
                u'{0}trainOrig.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            trainDf[u'CommentOut'].to_csv(
                u'{0}trainGS.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            testDf[u'CommentIn'].to_csv(
                u'{0}testOrig.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            testDf[u'CommentOut'].to_csv(
                u'{0}testGS.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            validDf[u'CommentIn'].to_csv(
                u'{0}validationOrig.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
            validDf[u'CommentOut'].to_csv(
                u'{0}validationGS.tsv'.format(outputFolderPath),
                sep='\t',
                index=False)
        return trainDf, testDf, validDf
    raise IndexError(
        'The number of ratio sizes is neither 2 nor 3. We require 2 ratio sizes to return a train and test set, and 3 to return train, test and validation sets.'
    )
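A call producing a train-validation-test split (the 0.7/0.2/0.1 ratios are illustrative):

trainDf, testDf, validDf = makeTrainTestValidSetsFromTsv(
    u'./001corpus/inputOutputGs.tsv',
    ratioSizes=[0.7, 0.2, 0.1],
    outputFolderPath=u'./002sets/')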
Example #8
def makeAlignLists(pathToTrainOrigTsv,
                   pathToTrainGoldTsv,
                   alignMostSimilar=False,
                   origAlignPath=u'./003alignedTrainSet/alignedOrigLists.tsv',
                   goldAlignPath=u'./003alignedTrainSet/alignedGoldLists.tsv'):
    ''' align each original training comment with its gold-standard counterpart and dump the aligned lists '''
    trainOrigAlignedList = []
    trainGoldAlignedList = []
    #open the train dataframe from the path
    trainOrigDf = myUtils.getDataFrameFromArgs(pathToTrainOrigTsv,
                                               header=False)[0]
    trainGoldDf = myUtils.getDataFrameFromArgs(pathToTrainGoldTsv,
                                               header=False)[0]
    #get the gold standard data against which to compare the training data
    for index, origComment in enumerate(trainOrigDf):
        goldComment = trainGoldDf[index]
        #align the 2
        alignedListOrig, alignedListGold = myUtils.align2SameLangStrings(
            origComment,
            goldComment,
            windowSize=4,
            alignMostSimilar=alignMostSimilar)
        #add to the lists of aligned elements
        trainOrigAlignedList.append(alignedListOrig)
        trainGoldAlignedList.append(alignedListGold)
    #dump the lists
    myUtils.dumpRawLines(trainOrigAlignedList,
                         origAlignPath,
                         addNewline=True,
                         rewrite=True)
    myUtils.dumpRawLines(trainGoldAlignedList,
                         goldAlignPath,
                         addNewline=True,
                         rewrite=True)

    #in tab separated form
    def dumpStringTsv(alignedList, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, aligned in enumerate(alignedList):
            alignString = u''
            for ind, elem in enumerate(aligned):
                alignString = u'{0}\t{1}'.format(alignString,
                                                 elem) if ind != 0 else elem
            #dump
            if index != (len(alignedList) - 1):
                openFile.write(u'{0}\n'.format(alignString))
            else:
                openFile.write(alignString)
        openFile.close()

    def dumpBothTsv(origAlign, goldAlign, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, origAligned in enumerate(origAlign):
            goldAligned = goldAlign[index]
            origAlignString = u''
            goldAlignString = u''
            for ind, origElem in enumerate(origAligned):
                goldElem = goldAligned[ind]
                origAlignString = u'{0}\t{1}'.format(
                    origAlignString, origElem) if ind != 0 else origElem
                goldAlignString = u'{0}\t{1}'.format(
                    goldAlignString, goldElem) if ind != 0 else goldElem
            #dump
            if index != (len(origAlign) - 1):
                openFile.write(u'orig:\t{0}\ngold:\t{1}\n'.format(
                    origAlignString, goldAlignString))
            else:
                openFile.write(u'orig:\t{0}\ngold:\t{1}'.format(
                    origAlignString, goldAlignString))
        openFile.close()

    #dump
    dumpStringTsv(trainOrigAlignedList,
                  origAlignPath.replace(u'Lists.tsv', u'.tsv'))
    dumpStringTsv(trainGoldAlignedList,
                  goldAlignPath.replace(u'Lists.tsv', u'.tsv'))
    dumpBothTsv(trainOrigAlignedList, trainGoldAlignedList,
                u'./003alignedTrainSet/alignment.tsv')
    return trainOrigAlignedList, trainGoldAlignedList
Example #9
def makeAlignLists(pathToTrainOrigTsv,
                   pathToTrainGoldTsv,
                   origAlignPath=u'./003alignedTrainSet/alignedOrigLists.tsv',
                   goldAlignPath=u'./003alignedTrainSet/alignedGoldLists.tsv',
                   alignMostSimilar=False,
                   dumpBoth=False,
                   spacyModel=None):
    ''' align each original training comment with its gold-standard counterpart, using the chosen tokenizer, and dump the aligned lists '''
    trainOrigAlignedList = []
    trainGoldAlignedList = []
    #open the train dataframe from the path
    trainOrigDf = myUtils.getDataFrameFromArgs(pathToTrainOrigTsv,
                                               header=False)[0]
    trainGoldDf = myUtils.getDataFrameFromArgs(pathToTrainGoldTsv,
                                               header=False)[0]
    #get the gold standard data against which to compare the training data
    for index, origComment in enumerate(trainOrigDf):
        goldComment = trainGoldDf[index]
        #choose the tokenizer
        tokenizingFunct = partial(myUtils.multTokenizer,
                                  whatTokenizer=0,
                                  spacyModel=spacyModel)
        #align the 2
        alignedListOrig, alignedListGold = myUtils.align2SameLangStrings(
            origComment,
            goldComment,
            windowSize=3,
            alignMostSimilar=alignMostSimilar,
            tokenizingFunct=tokenizingFunct)
        #add to the lists of aligned elements
        trainOrigAlignedList.append(alignedListOrig)
        trainGoldAlignedList.append(alignedListGold)
    #dump the lists
    myUtils.dumpRawLines(trainOrigAlignedList,
                         origAlignPath,
                         addNewline=True,
                         rewrite=True)
    myUtils.dumpRawLines(trainGoldAlignedList,
                         goldAlignPath,
                         addNewline=True,
                         rewrite=True)

    #in tab separated form
    def dumpStringTsv(alignedList, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, aligned in enumerate(alignedList):
            alignString = u''
            for ind, elem in enumerate(aligned):
                alignString = u'{0}\t{1}'.format(alignString,
                                                 elem) if ind != 0 else elem
            #dump
            if index != (len(alignedList) - 1):
                openFile.write(u'{0}\n'.format(alignString))
            else:
                openFile.write(alignString)
        openFile.close()

    def dumpBothTsv(origAlign, goldAlign, filePath):
        openFile = myUtils.createEmptyFile(filePath)
        for index, origAligned in enumerate(origAlign):
            goldAligned = goldAlign[index]
            origAlignString = u''
            goldAlignString = u''
            for ind, origElem in enumerate(origAligned):
                goldElem = goldAligned[ind]
                origAlignString = u'{0}\t{1}'.format(
                    origAlignString, origElem) if ind != 0 else origElem
                goldAlignString = u'{0}\t{1}'.format(
                    goldAlignString, goldElem) if ind != 0 else goldElem
            #dump
            if index != (len(origAlign) - 1):
                openFile.write(u'orig:\t{0}\ngold:\t{1}\n'.format(
                    origAlignString, goldAlignString))
            else:
                openFile.write(u'orig:\t{0}\ngold:\t{1}'.format(
                    origAlignString, goldAlignString))
        openFile.close()

    #get the right paths to dump the grepable alignments
    origPathList = (origAlignPath.replace(u'Lists', u'')).split(u'/')
    goldPathList = (goldAlignPath.replace(u'Lists', u'')).split(u'/')
    #dump
    dumpStringTsv(
        trainOrigAlignedList,
        u'{0}/grepable/{1}'.format(u'/'.join(origPathList[:-1]),
                                   origPathList[-1]))
    dumpStringTsv(
        trainGoldAlignedList,
        u'{0}/grepable/{1}'.format(u'/'.join(goldPathList[:-1]),
                                   goldPathList[-1]))
    if dumpBoth:
        dumpBothTsv(trainOrigAlignedList, trainGoldAlignedList,
                    u'./003alignedTrainSet/alignment.tsv')
    return trainOrigAlignedList, trainGoldAlignedList
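A usage sketch, assuming a French spaCy model such as fr_core_news_sm is installed (the training-set paths are hypothetical):

import spacy

spacyModel = spacy.load(u'fr_core_news_sm')
origAligned, goldAligned = makeAlignLists(u'./002sets/trainOrig.tsv',
                                          u'./002sets/trainGS.tsv',
                                          alignMostSimilar=True,
                                          spacyModel=spacyModel)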