import ast

import myUtils


def applyLearnedDictPlusHumanDict(testOrigPath, normOutPath=None,
                                  learnedDictPath=u'./005learnedDict/ororaAbbreviationDict.json',
                                  humanDictPath=u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json'):
    ''' applies the human-made and the learned normalization dicts to the test data '''
    #open the dicts
    learnedDict = myUtils.openJsonFileAsDict(learnedDictPath)
    humanDict = myUtils.openJsonFileAsDict(humanDictPath)
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    for index, testComment in testOrigDf.iteritems():
        #use the human dict FIRST (priority to the human-made dict)
        normOutput, _ = ororaZeAbbreviations(testComment, humanDict, listTheVariations=False)
        #then use the learned dict
        normOutput, _ = ororaZeAbbreviations(normOutput, learnedDict, listTheVariations=False)
        #save into the pandas series
        testOrigDf[index] = normOutput
    #dump the normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    return testOrigDf
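# Hedged usage sketch: both JSON dicts are expected to map uppercased,
# de-diacriticized tokens to lists of (variant, score) pairs, as consumed by
# ororaZeAbbreviations below. The entry and the paths here are invented
# placeholders, not the real dict contents:
#   {"SVP": [["S IL VOUS PLAIT", 0.85], ["SVP", 0.15]]}
#   normDf = applyLearnedDictPlusHumanDict(u'./004testData/testOrig.tsv',
#                                          normOutPath=u'./006normalized/testNorm.tsv')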
def dumpDictIntersectAutoAndHuman(autoDict, humanDict=None, outputPath=None):
    ''' given 2 dicts, makes the intersection of the 2 (keys present in both of them),
    then dumps and returns it, along with the non-intersecting part of the auto dict '''
    newDict = {}
    nonIntersectAutoDict = {}
    #load the dicts if they are given as paths instead of dicts
    if humanDict is None:
        humanDict = myUtils.openJsonFileAsDict(
            u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json')
    elif type(humanDict) is str:
        humanDict = myUtils.openJsonFileAsDict(humanDict)
    if type(autoDict) is str:
        autoDict = myUtils.openJsonFileAsDict(autoDict)
    #get the intersection of the keys appearing in both dicts
    intersection = set(autoDict.keys()) & set(humanDict.keys())
    for key in intersection:
        newDict[key] = autoDict[key]
    #get the auto keys not appearing in the intersection
    for key, val in autoDict.items():
        if key not in intersection:
            nonIntersectAutoDict[key] = val
    #dump both dicts
    if outputPath is None:
        outputPath = u'./005learnedDict/intersectionHumanAutoDict/humanAutoDict.json'
    myUtils.dumpDictToJsonFile(newDict, pathOutputFile=outputPath, overwrite=True)
    myUtils.dumpDictToJsonFile(
        nonIntersectAutoDict,
        pathOutputFile=outputPath.replace(u'HumanAutoDict/', u'HumanAutoDict/nonIntersect/').replace(u'.json', u'NonIntersect.json'),
        overwrite=True)
    return newDict, nonIntersectAutoDict
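# Hedged illustration of the key-splitting logic above: a tiny demo on invented
# dicts (the tokens are made up; the real dicts live in the JSON files). It only
# exercises the set arithmetic and does not write any file.
def _demoIntersectLogic():
    autoDict = {u'TEL': [[u'TELEPHONE', 1.0]], u'BJR': [[u'BONJOUR', 1.0]]}
    humanDict = {u'TEL': [[u'TELEPHONE', 1.0]]}
    intersection = set(autoDict.keys()) & set(humanDict.keys())
    print(intersection)  # {u'TEL'} -> goes into newDict
    print(set(autoDict.keys()) - intersection)  # {u'BJR'} -> goes into nonIntersectAutoDict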
def applyNormalisation(testOrigPath, normOutPath=None, normalization=None, *args):
    ''' applies the given normalization (dict or function) to the test data '''
    #if we are given a path to the place where the dict is
    if type(normalization) is str:
        normalization = myUtils.openJsonFileAsDict(normalization)
    #start an empty dejavu dict and a counter of tokens with multiple variants
    dejavuDict = {}
    c = 0
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    for index, testComment in testOrigDf.iteritems():
        if normalization is None:
            normOutput = testComment
        #use the dict as a normalization
        elif type(normalization) is dict:
            normOutput, c = ororaZeAbbreviations(testComment, normalization, listTheVariations=True, c=c)
        #otherwise treat the normalization argument as a callable (e.g., a spell corrector)
        else:
            #detect the french feminine agreement and fossilize the word by modifying
            #its structure into something unchanged by the normalization function
            normOutput = frenchFemininAccordsCodification(testComment, isInput=True)
            #apply the spell corrector or other normalization function
            normOutput, dejavuDict = normalization(normOutput.lower(), dejavuDict, *args)
            #reverse the feminine-agreement code back into its original form
            normOutput = frenchFemininAccordsCodification(normOutput, isInput=False)
        #save into the pandas series
        testOrigDf[index] = normOutput
    #dump the normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    print(u'tokens with reliable variants: {0} over {1} comments (ratio {2})'.format(
        c, len(testOrigDf), float(c) / float(len(testOrigDf))))
    return testOrigDf
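# Hedged usage sketch of the three normalization modes; the paths and the
# someSpellCorrector callable are invented placeholders:
#   applyNormalisation(u'./004testData/testOrig.tsv')  # normalization=None: plain copy
#   applyNormalisation(u'./004testData/testOrig.tsv', u'./out.tsv',
#                      u'./005learnedDict/ororaAbbreviationDict.json')  # dict-based
#   applyNormalisation(u'./004testData/testOrig.tsv', u'./out.tsv',
#                      someSpellCorrector, extraArg)  # function-based, *args passed through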
def ororaZeAbbreviations(string, abbrDict=None, listTheVariations=False, c=0):
    ''' ABBR --> ABBREVIATION '''
    def makeReplacements(token):
        #replace diacritical characters with their non-diacritical counterparts
        replacements = [(u'A', u'ÀÂ'), (u'E', u'ÉÈÊ'), (u'I', u'ÎÏ'), (u'O', u'Ô'),
                        (u'U', u'ÙÛÜ'), (u'C', u'Ç')]
        for replaceTuple in replacements:
            for char in replaceTuple[1]:
                token = token.replace(char, replaceTuple[0])
                token = token.replace(char.lower(), replaceTuple[0].lower())
        return token
    #open the abbreviation dict
    if abbrDict is None:
        abbrDict = myUtils.openJsonFileAsDict(u'./005learnedDict/ororaAbbreviationDict.json')
    #open the abbr dict file if it's a path
    elif type(abbrDict) is str:
        abbrDict = myUtils.openJsonFileAsDict(abbrDict)
    #abbreviation replacement
    stringList = string.split(u' ')
    #weighted dict: the values are lists of (variant, score) pairs
    if type(abbrDict[list(abbrDict.keys())[0]]) is list:
        for index, token in enumerate(stringList):
            #if the token is in the dict
            if makeReplacements(token).upper() in abbrDict:
                minScore = 0.55
                #if we search only for the first and most common option
                if listTheVariations == False:
                    #if the token has a reliable score
                    if abbrDict[makeReplacements(token).upper()][0][1] >= minScore:
                        stringList[index] = abbrDict[makeReplacements(token).upper()][0][0]
                #if we want to return a list of all the possibilities in decreasing order
                else:
                    variations = [var[0] for var in abbrDict[makeReplacements(token).upper()]
                                  if var[1] >= minScore]
                    stringList[index] = u'¤'.join(variations) if len(variations) != 0 else makeReplacements(token).upper()
                    #count the tokens for which at least one reliable variant was found
                    if len(variations) != 0:
                        c += 1
    #plain dict: the values are single replacement strings
    else:
        stringList = [token if makeReplacements(token).upper() not in abbrDict
                      else abbrDict[makeReplacements(token).upper()] for token in stringList]
    string = u' '.join(stringList)
    return string, c
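# Hedged usage sketch for ororaZeAbbreviations with an in-memory weighted dict;
# the abbreviation, variants and scores below are invented for illustration,
# they are not entries from the learned dict.
def _demoOroraZeAbbreviations():
    sampleDict = {u'TEL': [(u'TELEPHONE', 0.7), (u'TEL.', 0.3)]}
    #most common variant only
    normed, c = ororaZeAbbreviations(u'rappeler au tél demain', sampleDict)
    print(normed)  # u'rappeler au TELEPHONE demain'
    #all variants above minScore, joined with the ¤ separator (here only one qualifies)
    normedVar, c = ororaZeAbbreviations(u'rappeler au tél demain', sampleDict, listTheVariations=True)
    print(normedVar, c)  # u'rappeler au TELEPHONE demain' 1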
def makeDictFromTsvTrain(pathNonMatch, trainedDict=None, outputDictFilePath=False):
    ''' makes a normalization dict out of the non-matching original/gold alignments
    (NB: superseded by the extended redefinition below) '''
    #open the dict
    if trainedDict is None:
        trainedDict = {}
    elif type(trainedDict) == str:
        trainedDict = myUtils.openJsonFileAsDict(trainedDict)
    #open as a list the non-identical elements between the original and the gold
    if type(pathNonMatch) is str:
        nonMatchList = []
        with open(pathNonMatch) as nonMatchFile:
            line = nonMatchFile.readline()
            while line:
                nonMatchList.append(ast.literal_eval(
                    line.replace(u', \n', u'').replace(u',\n', u'').replace(u'\n', u'')))
                line = nonMatchFile.readline()
    else:
        nonMatchList = pathNonMatch
    #get the gold standard data to which to compare the training data
    for nonMatchingAlignment in nonMatchList:
        #if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                #use the original token as a key
                trainedDict[nonMatchTupl[0]] = trainedDict.get(nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    #clean the dict
    for origKey, goldValList in dict(trainedDict).items():
        #eliminate all the elements in the dict that have an empty symbol as a value
        if set(goldValList) == {u'∅'}:
            del trainedDict[origKey]
        #eliminate the elements containing a number character
        elif myUtils.detectNbChar(origKey) == True:
            del trainedDict[origKey]
        #(optionally, entries with more than one gold value could be dropped here as ambiguous)
        else:
            #change the goldValList into a sorted list with a count of the recurrences
            goldValSortedList = []
            #eliminate the empty symbol from the list
            goldValList = [elem for elem in goldValList if elem not in [u'∅', u'']]
            for goldVal in set(goldValList):
                #count the instances of each gold value
                counter = 0
                for gv in goldValList:
                    if goldVal == gv:
                        counter += 1
                #add the token and its normalized score
                goldValSortedList.append((goldVal, float(counter) / float(len(goldValList))))
            #sort the list by decreasing score
            goldValSortedList.sort(reverse=True, key=lambda x: x[1])
            trainedDict[origKey] = goldValSortedList
    print(u'trained dict entries: {0}'.format(len(trainedDict)))
    #dump the dict
    if outputDictFilePath != False:
        myUtils.dumpDictToJsonFile(trainedDict, outputDictFilePath, overwrite=True)
    return trainedDict
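# Hedged sketch of the expected non-match file format: each line holds a
# Python-literal list of (original, gold) alignment pairs; the tokens below are
# invented examples, not real training data.
def _demoNonMatchLineParsing():
    line = u"[('bjr', 'bonjour'), ('tel', 'telephone')], \n"
    alignment = ast.literal_eval(
        line.replace(u', \n', u'').replace(u',\n', u'').replace(u'\n', u''))
    print(alignment)  # [('bjr', 'bonjour'), ('tel', 'telephone')]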
def makeDictFromTsvTrain(pathNonMatchList, existingDict=None, pathMatch=None,
                         pathNonMatch=None, language=u'fr', outputDictFilePath=False):
    ''' makes a normalization dict out of the non-matching original/gold alignments,
    with extra cleaning steps based on the match/non-match counter dicts '''
    #open the trained dict
    if existingDict is None:
        trainedDict = {}
    elif type(existingDict) == str:
        trainedDict = myUtils.openJsonFileAsDict(existingDict)
    else:
        trainedDict = dict(existingDict)
    #open the match/non-match counter dicts
    if pathMatch is not None:
        matchCounterDict = myUtils.openJsonFileAsDict(pathMatch)
        nonMatchCounterDict = myUtils.openJsonFileAsDict(pathNonMatch)
    #open as a list the non-identical elements between the original and the gold
    if type(pathNonMatchList) is str:
        nonMatchList = []
        with open(pathNonMatchList) as nonMatchFile:
            line = nonMatchFile.readline()
            while line:
                nonMatchList.append(ast.literal_eval(
                    line.replace(u', \n', u'').replace(u',\n', u'').replace(u'\n', u'')))
                line = nonMatchFile.readline()
    else:
        nonMatchList = pathNonMatchList
    #get the gold standard data to which to compare the training data
    for nonMatchingAlignment in nonMatchList:
        #if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                #use the original token as a key
                trainedDict[nonMatchTupl[0]] = trainedDict.get(nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    #first cleaning: eliminate the replacements that appear a lot less often than the unchanged variant
    if pathMatch is not None:
        for origKey, goldValList in dict(trainedDict).items():
            try:
                nbMatch = matchCounterDict[origKey]
                indexesToDel = []
                for indexGold, goldVal in enumerate(goldValList):
                    nbNonMatch = nonMatchCounterDict[origKey][goldVal]
                    #(alternatively, keep every gold value no matter how uncommon)
                    #remove the gold value if it is not a very common unmatching replacement
                    if float(nbMatch) / float(nbMatch + nbNonMatch) >= 0.55:
                        indexesToDel.append(indexGold)
                    elif nbNonMatch < 10:
                        indexesToDel.append(indexGold)
                #delete in reverse order so the remaining indexes stay valid
                for indexGold in reversed(indexesToDel):
                    del goldValList[indexGold]
                #if the list is empty delete the entry, otherwise keep the cleaned list
                if len(goldValList) == 0:
                    del trainedDict[origKey]
                else:
                    trainedDict[origKey] = goldValList
            except KeyError:
                pass
    #second cleaning: get rid of the empty element
    if u'' in trainedDict:
        del trainedDict[u'']
    #third cleaning: eliminate the elements whose value is way smaller than the key and
    #the entries whose key has empirically shown to induce errors
    for origKey, goldValList in dict(trainedDict).items():
        #if the key is undesirable
        if undesireableKey(origKey) == True:
            del trainedDict[origKey]
        else:
            indexesToDel = []
            for indexVal, goldVal in enumerate(list(goldValList)):
                #if the val is much smaller than the key
                if float(len(goldVal)) / float(len(origKey)) <= 0.4:
                    indexesToDel.append(indexVal)
                #if the val is undesirable
                elif undesireableVal(origKey, goldVal) == True:
                    indexesToDel.append(indexVal)
            #delete the indexes we detected as causing problems
            for indexVal in reversed(indexesToDel):
                del goldValList[indexVal]
            #if we have produced an empty list drop the entry, otherwise keep it
            if len(goldValList) == 0:
                del trainedDict[origKey]
            else:
                trainedDict[origKey] = goldValList
    #fourth cleaning: rule-based cleaning
    for origKey, goldValList in dict(trainedDict).items():
        #eliminate all the elements in the dict that have an empty symbol as key or value
        if set(goldValList) == {u'∅'} or origKey == u'∅':
            del trainedDict[origKey]
        #(optionally, entries with more than one gold value could be dropped here as ambiguous)
        #eliminate the elements containing a number character
        elif myUtils.detectNbChar(origKey) == True:
            del trainedDict[origKey]
        #eliminate the elements whose key is a stop-word
        elif myUtils.isTokenStopWord(origKey, language) == True:
            del trainedDict[origKey]
        else:
            #change the goldValList into a sorted list with a count of the recurrences
            goldValSortedList = []
            #eliminate the empty symbol from the list
            goldValList = [elem for elem in goldValList if elem not in [u'∅', u'']]
            for goldVal in set(goldValList):
                #count the instances of each gold value
                counter = 0
                for gv in goldValList:
                    if goldVal == gv:
                        counter += 1
                #add the token and its normalized score
                goldValSortedList.append((goldVal, float(counter) / float(len(goldValList))))
            #sort the list and add it to the dict
            if len(goldValSortedList) != 0:
                goldValSortedList.sort(reverse=True, key=lambda x: x[1])
                trainedDict[origKey] = goldValSortedList
    #dump the dict
    if outputDictFilePath != False:
        myUtils.dumpDictToJsonFile(trainedDict, outputDictFilePath, overwrite=True)
    return trainedDict
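# Hedged sketch of the expected outcome on invented data: given the alignments
#   [[(u'bjr', u'bonjour')], [(u'bjr', u'bonjour'), (u'bjr', u'bjour')]]
# and no counter dicts (pathMatch=None), the function should return roughly
#   {u'bjr': [(u'bonjour', 0.666...), (u'bjour', 0.333...)]}
# provided undesireableKey/undesireableVal let these tokens through and u'bjr'
# is not a stop-word for the given language.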