def ororaZeAbbreviations(string, abbrDict=None):
    '''
    replaces the known abbreviations in a string with their full form
    (ABBR --> ABBREVIATION)
    '''
    def makeReplacements(token):
        ''' replace diacritical characters with their non-diacritical equivalents '''
        replacements = [(u'A', u'ÀÂ'), (u'E', u'ÉÈÊ'), (u'I', u'ÎÏ'),
                        (u'O', u'Ô'), (u'U', u'ÙÛÜ'), (u'C', u'Ç')]
        for replaceTuple in replacements:
            for char in replaceTuple[1]:
                token = token.replace(char, replaceTuple[0])
                token = token.replace(char.lower(), replaceTuple[0].lower())
        return token

    #open the default abbreviation dict
    if abbrDict is None:
        abbrDict = utilsOs.openJsonFileAsDict(u'./ororaAbbreviationDict.json')
    #open the abbr dict file if a path was given
    elif type(abbrDict) is str:
        abbrDict = utilsOs.openJsonFileAsDict(abbrDict)
    #abbreviation replacement
    stringList = []
    for token in string.split(u' '):
        abbrKey = makeReplacements(token).upper()
        stringList.append(abbrDict[abbrKey] if abbrKey in abbrDict else token)
    string = u' '.join(stringList)
    return string
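
# A minimal usage sketch of ororaZeAbbreviations, assuming a hypothetical
# in-memory abbreviation dict (so no JSON file is needed): keys are the
# uppercase, de-diacriticized tokens, values are their expansions.
demoAbbrDict = {u'ABBR': u'abbreviation', u'QTE': u'quantite'}
print(ororaZeAbbreviations(u'expand the abbr and the qté', demoAbbrDict))
# expected output: expand the abbreviation and the quantite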
Example #2
def englishOrFrench(string):
    '''guesses the language of a string between english and french'''
    #presence of french specific diacriticals
    diacriticals = [
        u'à', u'â', u'è', u'é', u'ê', u'ë', u'ù', u'û', u'ô', u'î', u'ï', u'ç',
        u'œ'
    ]
    for char in diacriticals:
        if char in string:
            return u'fr'
    #token detection
    unkTokendict = tokenDictMaker(string)
    #ngram char detection
    unkNgramDict = trigramDictMaker(
        string.replace(u'\n', u' ').replace(u'\r', u''))
    #if the obtained dict is empty, unable to detect (probably just noise)
    if len(unkTokendict) == 0 or len(unkNgramDict) == 0:
        return u'unknown'
    #token scores
    frenchTokScore = langDictComparison(
        unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/frTok.json'))
    englishTokScore = langDictComparison(
        unkTokendict, utilsOs.openJsonFileAsDict(u'./utilsString/enTok.json'))
    #ngram scores
    frenchNgramScore = langDictComparison(
        unkNgramDict,
        utilsOs.openJsonFileAsDict(u'./utilsString/fr3gram.json'))
    englishNgramScore = langDictComparison(
        unkNgramDict,
        utilsOs.openJsonFileAsDict(u'./utilsString/en3gram.json'))
    #the smaller the string (in tokens), the more we want to prioritize the token score instead of the ngram score
    if len(unkTokendict) < 5:
        ratioNgram = float(len(unkTokendict)) / 10.0
        frenchTokScore = frenchTokScore * (1.0 - ratioNgram)
        frenchNgramScore = frenchNgramScore * ratioNgram
        englishTokScore = englishTokScore * (1.0 - ratioNgram)
        englishNgramScore = englishNgramScore * ratioNgram
    #we compare the sum of the language scores
    if (frenchTokScore + frenchNgramScore) < (englishTokScore +
                                              englishNgramScore):
        return u'fr'
    return u'en'
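
# A worked example of the weighting above, with hypothetical scores where a
# lower score means closer to the language model: with 2 tokens, ratioNgram
# is 0.2, so a token score of 10.0 and an ngram score of 30.0 combine to
# 10.0 * 0.8 + 30.0 * 0.2 = 14.0, letting the token score dominate.
demoRatioNgram = float(2) / 10.0
print(10.0 * (1.0 - demoRatioNgram) + 30.0 * demoRatioNgram)  # 14.0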
def applyOnNotFlaggedForNHours(n=1):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    # apply for n hours
    for nId in list(scheduleDict.keys())[:n]:
        indexesToApply = scheduleDict[nId]
        applyHeuristicsOnNotFlaggedCorpus(indexesToApply, nId)
        # remove from the dict once we dump the scores
        del scheduleDict[nId]
    # save the remaining schedule dict
    utilsOs.dumpDictToJsonFile(scheduleDict,
                               pathOutputFile=schedule,
                               overwrite=True)
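
# The heurSchedule.json consumed above is assumed to map an hour-slot id to
# the list of corpus indexes to treat in that slot, e.g.:
# {"0": [0, 1, 2], "1": [3, 4, 5]}
# (the exact shape is an assumption inferred from how scheduleDict is used)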
Example #4
def countResultStrDistanceDict(dictResults):
    '''
    counts the results in the string distance dict
    '''
    if type(dictResults) is str:
        dictResults = utilsOs.openJsonFileAsDict(dictResults)
    for keyNb, neighDict in dictResults.items():
        print(u'Edit distance of {0}:'.format(keyNb))
        print(u'\tNb of nodes with neighbours of distance {0}: {1}'.format(
            keyNb, str(len(neighDict))))
        totalNeigh = 0
        for nodeKey, neighboursList in neighDict.items():
            totalNeigh += len(neighboursList)
        print(u'\t\tMean nb of neighbours: {0}'.format(
            float(totalNeigh) / float(len(neighDict))))
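
# A hedged usage sketch of countResultStrDistanceDict; the structure below is
# an assumption inferred from the loops above (distance -> node -> neighbour
# list) and the node names are invented:
demoResults = {u'1': {u'nodeA': [u'nodeB', u'nodeC'], u'nodeD': [u'nodeB']}}
countResultStrDistanceDict(demoResults)
# expected: 2 nodes with neighbours of distance 1, mean nb of neighbours 1.5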
def generateCmd(nHours=1, machineList=None):
    if machineList is None:
        machineList = [
            u'octal06', u'octal03', u'octal04', u'octal05', u'octal07',
            u'octal17', u'ilar01', u'ilar02', u'bart2', u'bart3', u'bart4',
            u'bart5', u'bart6', u'bart7', u'bart10', u'kakia1', u'kakia2',
            u'kakib2', u'kakic2', u'kakid1', u'kakid2', u'kakie2', u'kakif1',
            u'kakif2'
        ]
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    scheduleIdList = list(scheduleDict.keys())
    commandLns = []
    for machine in machineList:
        commandLns.append(
            u'#########################################################')
        commandLns.append(u'ssh {0}'.format(machine))
        commandLns.append(u'source .bashrc')
        commandLns.append(u'cd ~/Documents/workRALI/004tradBureau')
        simultaneousRuns = 4
        # if the machine is high end, run more
        if machine in [
                u'bart2', u'bart3', u'bart4', u'bart5', u'bart6', u'bart7',
                u'bart10', u'kakid2'
        ]:
            simultaneousRuns = 6
        if machine in [
                u'kakia1', u'kakia2', u'kakic2', u'kakid1', u'kakie2',
                u'kakif1', u'kakif2'
        ]:
            simultaneousRuns = 8
        for n in range(simultaneousRuns):
            commandLns.append(
                u'python b009applyHeuristicsOnMC.py -ap True -w {0} -li {1} &'.
                format(n * 20, u'*'.join(scheduleIdList[:nHours])))
            scheduleIdList = [
                nId for nId in scheduleIdList
                if nId not in scheduleIdList[:nHours]
            ]
        # commandLns[-1] = commandLns[-1].replace(u' &', u'')
        commandLns.append(u'\nENDSSH\n')
    print(u'\n'.join(commandLns))
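
# For illustration, one machine block printed by generateCmd looks roughly
# like this (the schedule ids below are invented; the real ones come from
# heurSchedule.json, and -w staggers each run by n * 20):
# #########################################################
# ssh octal06
# source .bashrc
# cd ~/Documents/workRALI/004tradBureau
# python b009applyHeuristicsOnMC.py -ap True -w 0 -li 12 &
# python b009applyHeuristicsOnMC.py -ap True -w 20 -li 13 &
# ...
#
# ENDSSH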
Example #6
def getCommunityNameInferences(nodeDf, outputFilePath):
	'''
	using a bag of words on job titles of the same community and on
	job titles and descriptions from existing ontologies (ESCO),
	we estimate the name of the community domain
	'''
	inferencesDict = {}
	#bag of words of the esco ontology
	escoTree = utilsOs.openJsonFileAsDict(u'./jsonJobTaxonomies/escoTree.json')
	escoTreeBagOfWords = getEscoBowByLevel(escoTree)
	#bag of words of the communities in our ontology
	communityBagOfWords = getOntologyBowByCommunity(nodeDf)
	#add an empty column
	nodeDf[u'Infered_Community_Name_Lvl_0'] = np.nan
	#comparing intersection between esco bow and the communities bow
	for community, communityBow in communityBagOfWords.items():
		#reset values of best intersection
		bestIntersection = {u'result': 0.0, u'set': None, u'name': u'00000000___'}
		for nb in reversed(range(1, 4)):
			for escoDomain, escoBow in escoTreeBagOfWords[nb].items():
				#we intersect the 2 bag of words
				bowIntersection = communityBow.intersection(escoBow)
				#we evaluate the replacement threshold: at the same esco level as the current best,
				#the new intersection only needs a score greater than the previous one; at a different
				#level, it must beat it by the level-dependent multiplier below
				if len(bestIntersection[u'name'].split(u'___')[0]) == len(escoDomain.split(u'___')[0]):
					multiplier = 1.0
				else:
					multiplier = (4.1 - (nb * 1.2))
				#if the score is greater, we replace the previous best intersection with the new intersection
				if len(bowIntersection) > bestIntersection['result']*multiplier or bestIntersection['result'] == 0.0:
					bestIntersection[u'result'] = len(bowIntersection)
					bestIntersection[u'set'] = bowIntersection
					bestIntersection[u'name'] = escoDomain
		#saving the information
		inferencesDict[community] = bestIntersection
		nodeDf.loc[nodeDf[u'Community_Lvl_0'] == community, u'Infered_Community_Name_Lvl_0'] = str(bestIntersection[u'name'])
	#dump to file
	nodeDf.to_csv(outputFilePath, sep='\t', index=False)
	return inferencesDict
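
# A quick worked example of the level multiplier above (arithmetic only; the
# interpretation of esco levels is the author's): when the candidate sits at
# a different level than the current best, the multiplier is 4.1 - nb * 1.2,
# i.e. about 0.5 at nb=3, 1.7 at nb=2 and 2.9 at nb=1.
for demoNb in reversed(range(1, 4)):
    print(demoNb, 4.1 - (demoNb * 1.2))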
def applyOnSpecificId(idList):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    while True:
        try:
            scheduleDict = utilsOs.openJsonFileAsDict(schedule)
            break
        except json.decoder.JSONDecodeError:
            print(u'Try again to access the json. {0}'.format(idList[0]))
            time.sleep(7)
    for nId in idList:
        try:
            indexesToApply = scheduleDict[nId]
            # apply
            heurs = [
                u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
                u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
            ]
            applyHeuristicsOnNotFlaggedCorpus(indexesToApply,
                                              nId,
                                              heuristicsList=heurs)
        except KeyError:
            print(u'ATTENTION: KEYERROR with id {0} #####'.format(nId))
    print(u'FINISHED {0}...'.format(idList[0]))
Example #8
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
	'''
	given the path to the sigma.js exportation environment (ending in
	the folder "network/"), it changes the config.json file and the index.html
	file so they show the graph as intended
	'''
	#copying config.json file
	configContent = {
		"type": "network",
		"version": "1.0",
		"data": "data.json",
		"logo": {"file": "", "link": "", "text": ""},
		"text": {"more": "", "intro": "", "title": ""},
		"legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
		"features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
		"informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
		"sigma": {
			"drawingProperties": {
				"defaultEdgeType": "curve",
				"defaultHoverLabelBGColor": "#002147",
				"defaultLabelBGColor": "#ddd",
				"activeFontStyle": "bold",
				"defaultLabelColor": "#000",
				"labelThreshold": 999,
				"defaultLabelHoverColor": "#fff",
				"fontStyle": "bold",
				"hoverFontStyle": "bold",
				"defaultLabelSize": 14
			},
			"graphProperties": {"maxEdgeSize": 2, "minEdgeSize": 2, "minNodeSize": 0.25, "maxNodeSize": 2.5},
			"mouseProperties": {"maxRatio": 20, "minRatio": 0.75}
		}
	}
	pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
	if utilsOs.theFileExists(pathConfigJson):
		os.remove(pathConfigJson)
	utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)  
	#getting the color information from the data file
	colorCommunityDict = {}
	dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
	for nodeDict in dataDict[u'nodes']:
		try:
			if nodeDict[u'attributes'][u'community_lvl_0'] not in colorCommunityDict:
				colorCommunityDict[nodeDict[u'attributes'][u'community_lvl_0']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
			'''
			#####################################################
			#before I changed the names of the columns
			if nodeDict[u'attributes'][u'community'] not in colorCommunityDict:
				colorCommunityDict[nodeDict[u'attributes'][u'community']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name'])
			'''
		except KeyError:
			pass
	#modifying the index.html file
	with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
		fileLines = indexFile.readlines()
		indexDivisor = None
		for index, line in enumerate(fileLines):
			if line == u'\t\t<dt class="colours"></dt>\n':
				indexDivisor = index + 1
				break
		if indexDivisor is None:
			raise ValueError(u'no <dt class="colours"> line found in index.html')
		fileLines = fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values()) + [u'\t\t</dd>\n'] + fileLines[indexDivisor+1:]
	utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment), addNewline=False, rewrite=True)
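
# For reference, each legend entry injected into index.html above has this
# shape (the color and community name here are invented):
# 			<div style="color: #1f77b4;">● finance</div>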
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sys, time, os, shutil
sys.path.append(u'../utils')
sys.path.append(u'./utils')
import utilsOs, utilsString

outputPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/008originalDocumentsBt/'
tempPath = u'{0}tmp/'.format(outputPath)
outputDejaVusDocs = u'{0}json/dejaVusDocs.json'.format(outputPath)
outputDejaVusRequestId = u'{0}json/dejaVusRequests.json'.format(outputPath)

if utilsOs.theFileExists(outputDejaVusRequestId) is True:
    dejaVu = utilsOs.openJsonFileAsDict(outputDejaVusRequestId)
else:
    dejaVu = set([])

# count the time the algorithm takes to run
startTime = utilsOs.countTime()

# get url, user and password
with open(u'./b011pathUserPassword') as pup:
    url = pup.readline().replace(u'\n', u'')
    user = pup.readline().replace(u'\n', u'')
    passw = pup.readline().replace(u'\n', u'')

# open the driver go to the page
profile = webdriver.FirefoxProfile()
profile.set_preference("webdriver_assume_untrusted_issuer", False)
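
# The snippet is cut off here by the scrape; a minimal hedged continuation,
# assuming the selenium 3 API (firefox_profile was deprecated in selenium 4),
# would be:
# driver = webdriver.Firefox(firefox_profile=profile)
# driver.get(url)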
Example #10
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    make 2 vocabulary files (occurrence dict) in the format needed by giza++ or mgiza++
    then reformats the corpus into a the format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: None
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(
        pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is not a file there yet, open the corpus Files, count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dict
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        # make the id dict
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the file already exists or if overwrite is false
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
    # dump the empty tok voc file
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dict in the tok voc file
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile,
                             outputPathGizaFormatCorpus, enIdDict, frIdDict,
                             spFreqDict)
    return outputEnPath, outputFrPath, outputPathGizaFormatCorpus, outputEnDictPath, outputFrDictPath, outputSpDictPath
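
# The .vcb files written above follow the giza++ vocabulary format, one token
# per line as "id<TAB>token<TAB>frequency", with id 1 reserved for UNK and
# real tokens starting at id 2 (as in the loops above). A sample with
# invented frequencies:
# 1	UNK	0
# 2	the	42310
# 3	de	29847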
def annotateFiles(listOfFilesPath=None,
                  annotatedOutputFolder=u'./002manuallyAnnotated/',
                  dumpSP=True):
    """ given a list of paths, manually show and annotate the sentence pairs """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif type(listOfFilesPath) is str:
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
            noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [
            file for file in listOfFilesPath if file not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned
        \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy
        \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography
        \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines
        \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors
        \n1 - well aligned
        \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections 
        \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(
            filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(
            filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                    duringSentSource,
                                                    beforeSentTarget,
                                                    duringSentTarget,
                                                    listOfAnnotations,
                                                    lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [
                            ln.replace(u'\n', u'')
                            for ln in refFile.readlines()
                        ]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent,
                        u'{0}sample.en'.format(annotatedOutputFolder),
                        addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent,
                        u'{0}sample.fr'.format(annotatedOutputFolder),
                        addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                        duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                        duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringTarget))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count the lines long enough to wrap onto a second terminal line
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(
                        u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [
                                u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3',
                                u'1.0', u'1.1', u'c', u'correct'
                        ]:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(
                                u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(
                            u'Specific type annotation: ')
                        typeAnswers = [
                            u'0', u'1', u'2', u'3'
                        ] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(
                                    u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                    annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(
                            filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [
                                ln.replace(u'\n', u'')
                                for ln in refFile.readlines()
                            ]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the SP
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent,
                            u'{0}sample.en'.format(annotatedOutputFolder),
                            addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent,
                            u'{0}sample.fr'.format(annotatedOutputFolder),
                            addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
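
# Hedged invocation sketches, following the branches at the top of the
# function: annotateFiles() samples 100 random docs, a .json path is loaded
# via utilsOs.openJsonFileAsDict, and a single file path is wrapped in a
# list, e.g.:
# annotateFiles(listOfFilesPath=u'./002manuallyAnnotated/pathList.json')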
Example #12
def englishOrFrench(string):
    '''guesses the language of a string between english and french'''
    import re
    import utilsOs
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException
    #if the string is only made of numbers and non alphabetic characters we return 'unknown'
    if re.fullmatch(
            re.compile(
                r'([0-9]|-|\+|\!|\#|\$|%|&|\'|\*|\?|\.|\^|_|`|\||~|:|@)+'),
            string) is not None:
        return u'unknown'
    #if more than 30% of the string's characters fall outside the ascii and french blocks, it is probably another language, so we return 'unknown'
    if unicodeCodeScore(
            string, countSpaces=False, unicodeBlocksList=[[0, 255]]) < 0.7:
        return u'unknown'
    #if the string has a presence of unicode characters of french specific diacritics
    diacritics = [
        192, 194, [199, 203], 206, 207, 212, 140, 217, 219, 220, 159, 224, 226,
        [231, 235], 238, 239, 244, 156, 250, 251, 252, 255
    ]
    if unicodeCodeScore(
            string, countSpaces=False, unicodeBlocksList=diacritics) > 0.0:
        return u'fr'
    #putting the string in lowercase improves the language detection functions
    string = string.lower()
    #use langdetect, unless it returns something other than "en" or "fr"; short strings are easily mistaken for another language
    try:
        lang = detect(string)
        if lang in [u'en', u'fr']:
            return lang
    #if there is an encoding or character induced error, we try the alternative language detection
    except LangDetectException:
        pass
    #alternative language detection
    #token detection
    unkTokendict = tokenDictMaker(string)
    #ngram char detection
    unkNgramDict = trigramDictMaker(
        string.replace(u'\n', u' ').replace(u'\r', u''))
    #if the obtained dict is empty, unable to detect (probably just noise)
    if len(unkTokendict) == 0 or len(unkNgramDict) == 0:
        return u'unknown'
    #token scores
    frenchTokScore = langDictComparison(
        unkTokendict,
        utilsOs.openJsonFileAsDict(u'./utilsString/tokDict/frTok.json'))
    englishTokScore = langDictComparison(
        unkTokendict,
        utilsOs.openJsonFileAsDict(u'./utilsString/tokDict/enTok.json'))
    #ngram scores
    frenchNgramScore = langDictComparison(
        unkNgramDict,
        utilsOs.openJsonFileAsDict(u'./utilsString/charDict/frChar3gram.json'))
    englishNgramScore = langDictComparison(
        unkNgramDict,
        utilsOs.openJsonFileAsDict(u'./utilsString/charDict/enChar3gram.json'))
    #the smaller the string (in tokens), the more we want to prioritize the token score instead of the ngram score
    if len(unkTokendict) < 5:
        ratioNgram = float(len(unkTokendict)) / 10.0
        frenchTokScore = frenchTokScore * (1.0 - ratioNgram)
        frenchNgramScore = frenchNgramScore * ratioNgram
        englishTokScore = englishTokScore * (1.0 - ratioNgram)
        englishNgramScore = englishNgramScore * ratioNgram
    #we compare the sum of the language scores
    if (frenchTokScore + frenchNgramScore) < (englishTokScore +
                                              englishNgramScore):
        return u'fr'
    return u'en'
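
# A minimal usage sketch; the diacritic fast path needs none of the json
# resources, so these calls are assumed to work with only the helper
# functions above:
print(englishOrFrench(u'présentation du répertoire'))  # 'fr' via diacritics
print(englishOrFrench(u'42+17#!!'))  # 'unknown': only digits and symbols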