def ororaZeAbbreviations(string, abbrDict=None):
    ''' ABBR --> ABBREVIATION '''
    def makeReplacements(token):
        # replace diacritical characters with non-diacritical characters
        replacements = [(u'A', u'ÀÂ'), (u'E', u'ÉÈÊ'), (u'I', u'ÎÏ'), (u'O', u'Ô'), (u'U', u'ÙÛÜ'), (u'C', u'Ç')]
        for replaceTuple in replacements:
            for char in replaceTuple[1]:
                token = token.replace(char, replaceTuple[0])
                token = token.replace(char.lower(), replaceTuple[0].lower())
        return token
    # open the default abbreviation dict
    if abbrDict is None:
        abbrDict = utilsOs.openJsonFileAsDict(u'./ororaAbbreviationDict.json')
    # open the abbreviation dict file if a path was given
    elif type(abbrDict) is str:
        abbrDict = utilsOs.openJsonFileAsDict(abbrDict)
    # abbreviation replacement
    stringList = string.split(u' ')
    stringList = [token if makeReplacements(token).upper() not in abbrDict
                  else abbrDict[makeReplacements(token).upper()] for token in stringList]
    string = u' '.join(stringList)
    return string
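# Usage sketch (illustrative only): the abbreviation dict below is made up, real entries live in
# ororaAbbreviationDict.json; keys are matched against the de-accented, uppercased tokens.
#   exampleAbbrDict = {u'ABBR': u'ABBREVIATION', u'MOT': u'MOTEUR'}
#   ororaZeAbbreviations(u'échange du mot abbr', abbrDict=exampleAbbrDict)
#   # --> u'échange du MOTEUR ABBREVIATION'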
def englishOrFrench(string):
    '''guesses the language of a string between english and french'''
    # presence of french-specific diacritics
    diacriticals = [u'à', u'â', u'è', u'é', u'ê', u'ë', u'ù', u'û', u'ô', u'î', u'ï', u'ç', u'œ']
    for char in diacriticals:
        if char in string:
            return u'fr'
    # token detection
    unkTokendict = tokenDictMaker(string)
    # ngram char detection
    unkNgramDict = trigramDictMaker(string.replace(u'\n', u' ').replace(u'\r', u''))
    # if the obtained dict is empty, we are unable to detect the language (probably just noise)
    if len(unkTokendict) == 0 or len(unkNgramDict) == 0:
        return u'unknown'
    # token scores
    frenchTokScore = langDictComparison(unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/frTok.json'))
    englishTokScore = langDictComparison(unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/enTok.json'))
    # ngram scores
    frenchNgramScore = langDictComparison(unkNgramDict, utilsOs.openJsonFileAsDict(u'./utilsString/fr3gram.json'))
    englishNgramScore = langDictComparison(unkNgramDict, utilsOs.openJsonFileAsDict(u'./utilsString/en3gram.json'))
    # the smaller the string (in tokens), the more we want to prioritize the token score over the ngram score
    if len(unkTokendict) < 5:
        ratioNgram = float(len(unkTokendict)) / 10.0
        frenchTokScore = frenchTokScore * (1.0 - ratioNgram)
        frenchNgramScore = frenchNgramScore * ratioNgram
        englishTokScore = englishTokScore * (1.0 - ratioNgram)
        englishNgramScore = englishNgramScore * ratioNgram
    # we compare the sum of the language scores
    if (frenchTokScore + frenchNgramScore) < (englishTokScore + englishNgramScore):
        return u'fr'
    return u'en'
def applyOnNotFlaggedForNHours(n=1):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    # apply for n hours (dict_keys is not sliceable in python 3, so cast to a list first)
    for nId in list(scheduleDict.keys())[:n]:
        indexesToApply = scheduleDict[nId]
        applyHeuristicsOnNotFlaggedCorpus(indexesToApply, nId)
        # remove from the dict once we dump the scores
        del scheduleDict[nId]
    # save the remaining schedule dict
    utilsOs.dumpDictToJsonFile(scheduleDict, pathOutputFile=schedule, overwrite=True)
def countResultStrDistanceDict(dictResults):
    ''' counts the results in the str distance dict '''
    if type(dictResults) is str:
        dictResults = utilsOs.openJsonFileAsDict(dictResults)
    for keyNb, neighDict in dictResults.items():
        print(u'Edition distance of {0}:'.format(keyNb))
        print(u'\tNb of nodes with neighbours of distance {0}: {1}'.format(keyNb, str(len(neighDict))))
        totalNeigh = 0
        for nodeKey, neighboursList in neighDict.items():
            totalNeigh += len(neighboursList)
        print(u'\t\tMean nb of neighbours: {0}'.format(float(totalNeigh) / float(len(neighDict))))
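# Expected input shape (illustrative values): edition-distance keys mapping each node to its list
# of neighbours at that distance; the function only prints counts and means.
#   exampleResults = {u'1': {u'motor': [u'motors', u'rotor'], u'pump': [u'pomp']}}
#   countResultStrDistanceDict(exampleResults)
#   # distance 1: 2 nodes with neighbours, mean of 1.5 neighbours per node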
def generateCmd(nHours=1, machineList=None):
    if machineList is None:
        machineList = [u'octal06', u'octal03', u'octal04', u'octal05', u'octal07', u'octal17',
                       u'ilar01', u'ilar02', u'bart2', u'bart3', u'bart4', u'bart5', u'bart6',
                       u'bart7', u'bart10', u'kakia1', u'kakia2', u'kakib2', u'kakic2', u'kakid1',
                       u'kakid2', u'kakie2', u'kakif1', u'kakif2']
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    scheduleIdList = list(scheduleDict.keys())
    commandLns = []
    for machine in machineList:
        commandLns.append(u'#########################################################')
        commandLns.append(u'ssh {0}'.format(machine))
        commandLns.append(u'source .bashrc')
        commandLns.append(u'cd ~/Documents/workRALI/004tradBureau')
        simultaneousRuns = 4
        # if the machine is high end, run more simultaneous processes
        if machine in [u'bart2', u'bart3', u'bart4', u'bart5', u'bart6', u'bart7', u'bart10', u'kakid2']:
            simultaneousRuns = 6
        if machine in [u'kakia1', u'kakia2', u'kakic2', u'kakid1', u'kakie2', u'kakif1', u'kakif2']:
            simultaneousRuns = 8
        for n in range(simultaneousRuns):
            commandLns.append(u'python b009applyHeuristicsOnMC.py -ap True -w {0} -li {1} &'.format(
                n * 20, u'*'.join(scheduleIdList[:nHours])))
            scheduleIdList = [nId for nId in scheduleIdList if nId not in scheduleIdList[:nHours]]
        # commandLns[-1] = commandLns[-1].replace(u' &', u'')
        commandLns.append(u'\nENDSSH\n')
    print(u'\n'.join(commandLns))
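# Illustrative excerpt of the printed output for one machine with nHours=1 (the schedule ids
# shown are hypothetical; the real ones come from heurSchedule.json):
#   #########################################################
#   ssh octal06
#   source .bashrc
#   cd ~/Documents/workRALI/004tradBureau
#   python b009applyHeuristicsOnMC.py -ap True -w 0 -li 12 &
#   python b009applyHeuristicsOnMC.py -ap True -w 20 -li 103 &
#   ...
#   ENDSSH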
def getCommunityNameInferences(nodeDf, outputFilePath):
    '''
    using a bag of words made from the job titles of each community and bags of words made from the
    job titles and descriptions of an existing ontology (ESCO), we estimate the name of the
    community's domain
    '''
    inferencesDict = {}
    # bag of words of the esco ontology
    escoTree = utilsOs.openJsonFileAsDict(u'./jsonJobTaxonomies/escoTree.json')
    escoTreeBagOfWords = getEscoBowByLevel(escoTree)
    # bag of words of the communities in our ontology
    communityBagOfWords = getOntologyBowByCommunity(nodeDf)
    # add an empty column
    nodeDf[u'Infered_Community_Name_Lvl_0'] = np.nan
    # comparing the intersection between the esco bow and the communities bow
    for community, communityBow in communityBagOfWords.items():
        # reset the values of the best intersection
        bestIntersection = {u'result': 0.0, u'set': None, u'name': u'00000000___'}
        for nb in reversed(range(1, 4)):
            for escoDomain, escoBow in escoTreeBagOfWords[nb].items():
                # we intersect the 2 bags of words
                bowIntersection = communityBow.intersection(escoBow)
                # if we are at the same level in the esco taxonomy, the new score only needs to be
                # greater than the current best; if we are at an upper level, the needed score is
                # scaled by a level-dependent multiplier
                if len(bestIntersection[u'name'].split(u'___')[0]) == len(escoDomain.split(u'___')[0]):
                    multiplier = 1.0
                else:
                    multiplier = (4.1 - (nb * 1.2))
                # if the score is greater, we replace the previous best intersection with the new one
                if len(bowIntersection) > bestIntersection[u'result'] * multiplier or bestIntersection[u'result'] == 0.0:
                    bestIntersection[u'result'] = len(bowIntersection)
                    bestIntersection[u'set'] = bowIntersection
                    bestIntersection[u'name'] = escoDomain
        # saving the information
        inferencesDict[community] = bestIntersection
        nodeDf.loc[nodeDf[u'Community_Lvl_0'] == community, u'Infered_Community_Name_Lvl_0'] = str(bestIntersection[u'name'])
    # dump to file
    nodeDf.to_csv(outputFilePath, sep=u'\t', index=False)
    return inferencesDict
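# Illustrative scoring step (hypothetical bags of words): each community bow is intersected with
# every ESCO-level bow and the largest (multiplier-weighted) intersection gives the inferred name.
#   communityBow = {u'welder', u'pipe', u'fitter', u'solder'}
#   escoBow = {u'welder', u'metal', u'fitter'}
#   len(communityBow.intersection(escoBow))  # --> 2, the candidate score for this esco domain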
def applyOnSpecificId(idList):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    while True:
        try:
            scheduleDict = utilsOs.openJsonFileAsDict(schedule)
            break
        except json.decoder.JSONDecodeError:
            print(u'Try again to access the json. {0}'.format(idList[0]))
            time.sleep(7)
    for nId in idList:
        try:
            indexesToApply = scheduleDict[nId]
            # apply the heuristics
            heurs = [u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
                     u'mono', u'tabl', u'strBcks', u'punct', u'gibb']
            applyHeuristicsOnNotFlaggedCorpus(indexesToApply, nId, heuristicsList=heurs)
        except KeyError:
            print(u'ATTENTION: KEYERROR with id {0} #####'.format(nId))
    print(u'FINISHED {0}...'.format(idList[0]))
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
    '''
    given the path to the sigma.js exportation environment (ending in the folder "network/"),
    it changes the config.json file and the index.html file so they show the graph the way intended
    '''
    # copying config.json file
    configContent = {
        "type": "network", "version": "1.0", "data": "data.json",
        "logo": {"file": "", "link": "", "text": ""},
        "text": {"more": "", "intro": "", "title": ""},
        "legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
        "features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
        "informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
        "sigma": {
            "drawingProperties": {
                "defaultEdgeType": "curve", "defaultHoverLabelBGColor": "#002147",
                "defaultLabelBGColor": "#ddd", "activeFontStyle": "bold", "defaultLabelColor": "#000",
                "labelThreshold": 999, "defaultLabelHoverColor": "#fff", "fontStyle": "bold",
                "hoverFontStyle": "bold", "defaultLabelSize": 14},
            "graphProperties": {"maxEdgeSize": 2, "minEdgeSize": 2, "minNodeSize": 0.25, "maxNodeSize": 2.5},
            "mouseProperties": {"maxRatio": 20, "minRatio": 0.75}}}
    pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
    if utilsOs.theFileExists(pathConfigJson) is True:
        os.remove(pathConfigJson)
    utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)
    # getting the color information from the data file
    colorCommunityDict = {}
    dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
    for nodeDict in dataDict[u'nodes']:
        try:
            if nodeDict[u'attributes'][u'community_lvl_0'] not in colorCommunityDict:
                colorCommunityDict[nodeDict[u'attributes'][u'community_lvl_0']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(
                    nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
            # (before the column names were changed, the attributes used were u'community' and u'infered_community_name')
        except KeyError:
            pass
    # modifying the index.html file
    with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
        fileLines = indexFile.readlines()
    for index, line in enumerate(fileLines):
        if line == u'\t\t<dt class="colours"></dt>\n':
            indexDivisor = index + 1
            break
    fileLines = fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values()) + [u'\t\t</dd>\n'] + fileLines[indexDivisor + 1:]
    utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment), addNewline=False, rewrite=True)
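# Usage sketch (the path below is hypothetical): it must end with the sigma.js "network/" folder
# and already contain the exported data.json and index.html files.
#   modifyConfigAndIndexFiles(u'/path/to/exportation/network/')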
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sys, time, os, shutil

sys.path.append(u'../utils')
sys.path.append(u'./utils')
import utilsOs, utilsString

outputPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/008originalDocumentsBt/'
tempPath = u'{0}tmp/'.format(outputPath)
outputDejaVusDocs = u'{0}json/dejaVusDocs.json'.format(outputPath)
outputDejaVusRequestId = u'{0}json/dejaVusRequests.json'.format(outputPath)
if utilsOs.theFileExists(outputDejaVusRequestId) is True:
    dejaVu = utilsOs.openJsonFileAsDict(outputDejaVusRequestId)
else:
    dejaVu = set([])

# count the time the algorithm takes to run
startTime = utilsOs.countTime()

# get the url, user and password
with open(u'./b011pathUserPassword') as pup:
    url = pup.readline().replace(u'\n', u'')
    user = pup.readline().replace(u'\n', u'')
    passw = pup.readline().replace(u'\n', u'')

# open the driver and go to the page
profile = webdriver.FirefoxProfile()
profile.set_preference("webdriver_assume_untrusted_issuer", False)
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    makes 2 vocabulary files (occurrence dicts) in the format needed by giza++ or mgiza++,
    then reformats the corpus into the sentence-pair format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: the paths to the vocabulary files, the giza-format corpus and the frequency dicts
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is no file there yet, open the corpus files and count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dicts
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files and count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dicts by frequency
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        # make the id dicts
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump the dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the file already exists or if overwrite is false
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dicts by frequency
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # dump the empty tok voc files
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dicts in the tok voc files
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outputPathGizaFormatCorpus, enIdDict, frIdDict, spFreqDict)
    return (outputEnPath, outputFrPath, outputPathGizaFormatCorpus,
            outputEnDictPath, outputFrDictPath, outputSpDictPath)
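# Usage sketch (hypothetical corpus paths). Each line appended to the .vcb files follows the
# "<tokenId>\t<token>\t<frequency>" layout written above, with id 1 reserved for UNK:
#   reformatFilesPreGiza(u'./corpus/sentences.en', u'./corpus/sentences.fr', overwrite=True)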
def annotateFiles(listOfFilesPath=None, annotatedOutputFolder=u'./002manuallyAnnotated/', dumpSP=True):
    """ given a list of paths, manually show and annotate the sentence pairs """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif type(listOfFilesPath) is str:
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
                                                noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [file for file in listOfFilesPath if file not in annotatedFiles]
    # print the annotator cheat sheet
    print("""0 - badly aligned
\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy
\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography
\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines
\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors
1 - well aligned
\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sign of translation imperfections
\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource, duringSentSource, beforeSentTarget,
                                                    duringSentTarget, listOfAnnotations, lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [ln.replace(u'\n', u'') for ln in refFile.readlines()]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(enSent, u'{0}sample.en'.format(annotatedOutputFolder), addNewLine=False)
                    utilsOs.appendLineToFile(frSent, u'{0}sample.fr'.format(annotatedOutputFolder), addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count the lines that take the space of 2 terminal lines
                    longLines = getNbLongLines([beforeSentSource, beforeSentTarget, duringSentSource,
                                                duringSentTarget, afterSentSource, afterSentTarget], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3',
                                                     u'1.0', u'1.1', u'c', u'correct']:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (the specific type)
                        annotatorSpecificInput = input(u'Specific type annotation: ')
                        # compare against the string u'0' since input() returns a str, not an int
                        typeAnswers = [u'0', u'1', u'2', u'3'] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(float(u'{0}.{1}'.format(annotatorGeneralInput, annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [ln.replace(u'\n', u'') for ln in refFile.readlines()]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the files line by line, to be safe in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine, u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations, u'{0}sampleAnnotation.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the SP
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(enSent, u'{0}sample.en'.format(annotatedOutputFolder), addNewLine=False)
                        utilsOs.appendLineToFile(frSent, u'{0}sample.fr'.format(annotatedOutputFolder), addNewLine=False)
        # clear part of the terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def englishOrFrench(string):
    '''guesses the language of a string between english and french'''
    import utilsOs
    from langdetect.lang_detect_exception import LangDetectException
    # if the string is only made of numbers and non-alphabetic characters we return 'unknown'
    if re.fullmatch(re.compile(r'([0-9]|-|\+|\!|\#|\$|%|&|\'|\*|\?|\.|\^|_|`|\||~|:|@)+'), string) is not None:
        return u'unknown'
    # if more than 30% of the string characters are outside the ascii block and the french block,
    # then it must be another language and we return 'unknown'
    if unicodeCodeScore(string, countSpaces=False, unicodeBlocksList=[[0, 255]]) < 0.7:
        return u'unknown'
    # if the string contains unicode characters of french-specific diacritics
    diacritics = [192, 194, [199, 203], 206, 207, 212, 140, 217, 219, 220, 159, 224, 226,
                  [231, 235], 238, 239, 244, 156, 250, 251, 252, 255]
    if unicodeCodeScore(string, countSpaces=False, unicodeBlocksList=diacritics) > 0.0:
        return u'fr'
    # putting the string in lowercase improves the language detection functions
    string = string.lower()
    # use langdetect except if it returns something other than "en" or "fr"; if the string is too
    # short it is easy to mistake it for another language
    try:
        lang = detect(string)
        if lang in [u'en', u'fr']:
            return lang
    # if there is an encoding or character induced error, we try the alternative language detection
    except LangDetectException:
        pass
    # alternative language detection
    # token detection
    unkTokendict = tokenDictMaker(string)
    # ngram char detection
    unkNgramDict = trigramDictMaker(string.replace(u'\n', u' ').replace(u'\r', u''))
    # if the obtained dict is empty, we are unable to detect the language (probably just noise)
    if len(unkTokendict) == 0 or len(unkNgramDict) == 0:
        return u'unknown'
    # token scores
    frenchTokScore = langDictComparison(unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/tokDict/frTok.json'))
    englishTokScore = langDictComparison(unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/tokDict/enTok.json'))
    # ngram scores
    frenchNgramScore = langDictComparison(unkNgramDict, utilsOs.openJsonFileAsDict(u'./utilsString/charDict/frChar3gram.json'))
    englishNgramScore = langDictComparison(unkNgramDict, utilsOs.openJsonFileAsDict(u'./utilsString/charDict/enChar3gram.json'))
    # the smaller the string (in tokens), the more we want to prioritize the token score over the ngram score
    if len(unkTokendict) < 5:
        ratioNgram = float(len(unkTokendict)) / 10.0
        frenchTokScore = frenchTokScore * (1.0 - ratioNgram)
        frenchNgramScore = frenchNgramScore * ratioNgram
        englishTokScore = englishTokScore * (1.0 - ratioNgram)
        englishNgramScore = englishNgramScore * ratioNgram
    # we compare the sum of the language scores
    if (frenchTokScore + frenchNgramScore) < (englishTokScore + englishNgramScore):
        return u'fr'
    return u'en'
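# Usage sketch (illustrative strings), assuming unicodeCodeScore returns the proportion of
# characters whose code point falls inside the given blocks:
#   englishOrFrench(u'514-235-0000')         # only digits and symbols --> u'unknown'
#   englishOrFrench(u'début des années 90')  # french diacritics found --> u'fr'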