예제 #1
0
	def getJobsZackExtracted(self, jobFilePath, outputPath=None, to_remove=to_remove, pattrn=pattrn):
		'''
		Extracts the best job titles according to their co-reference,
		in decreasing order of n-gram length:
			chief executive officer 
				IS BETTER THAN 
			chief officer 
				IS BETTER THAN 
			officer
		:param jobFilePath: path to a file containing one job title per line
		:param outputPath: optional path where the resulting set is dumped
		:param to_remove: tokens to strip (module-level default)
		:param pattrn: pattern used for tokenization (module-level default)
		:return: set of the best job-title strings (the '<unk>' marker is excluded)
		'''
		setOfJobs = set()
		#count jobs co-reference (counting the (1-4)-gram token words in the job title)
		ngram_counts = self.getNgram_counts(jobFilePath, to_remove, pattrn)
		#get best possibility for each job title
		with codecs.open(jobFilePath, 'r', encoding='utf8') as openedFile:
			for jobTitle in openedFile:
				#NOTE: the original extractor was self.get_best(jobTitle, to_remove, pattrn, ngram_counts)
				bestOption = self.get_best_modified(jobTitle, to_remove, pattrn, ngram_counts)
				#add the 'best' job name to the final set, skipping unresolved titles
				if bestOption != "<unk>":
					setOfJobs.add(bestOption)
		#dump the output if the output path is specified
		if outputPath is not None:
			utilsOs.dumpRawLines(setOfJobs, outputPath, addNewline=True, rewrite=True)
		return setOfJobs
def saveNotFlaggedList():
    """Dump the anonymized paths of every NOT-FLAGGED file to an external file."""
    # collect the NOT-FLAGGED file paths and anonymize each of them
    anonymizedPaths = [
        b000path.anonymizePath(path)
        for path in getFilePathsLists([u'NOT-FLAGGED'])
    ]
    # save the path list in an external file
    outputPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/files.paths'
    utilsOs.dumpRawLines(anonymizedPaths, outputPath)
    return None
def mergeAnnotatedFiles(pathToPrimary, pathOrListOfPathsToSecondary):
    """ merge one or more secondary annotation folders into the primary one and
    dump the merged data (annotation, reference, sample.en, sample.fr) in the
    primary's paths
    :param pathToPrimary: path to the primary sampleAnnotation.tsv file or to its folder
    :param pathOrListOfPathsToSecondary: path (or list of paths) to the secondary
        sampleAnnotation.tsv files or to their folders
    """
    # given either the annotation file path or its folder, return both
    def dividePaths(pathAnnotFile):
        if u'sampleAnnotation.tsv' in pathAnnotFile:
            pathFolder = pathAnnotFile.replace(u'sampleAnnotation.tsv', u'')
        else:
            pathFolder = pathAnnotFile
            pathAnnotFile = u'{0}sampleAnnotation.tsv'.format(pathAnnotFile)
        return pathAnnotFile, pathFolder

    pathToPrimary, primaryFolder = dividePaths(pathToPrimary)
    # make secondary a list if it is a single path string
    if isinstance(pathOrListOfPathsToSecondary, str):
        pathOrListOfPathsToSecondary = [pathOrListOfPathsToSecondary]
    # open the primary dataframes
    primaryRefPath = u'{0}sampleReference.tsv'.format(primaryFolder)
    primaryAnnotDf, primaryRefDf = utilsOs.getDataFrameFromArgs(pathToPrimary,
                                                                primaryRefPath,
                                                                header=False)
    primaryEnPath = u'{0}sample.en'.format(primaryFolder)
    primaryFrPath = u'{0}sample.fr'.format(primaryFolder)
    primaryEnDf, primaryFrDf = utilsOs.getDataFrameFromArgs(primaryEnPath,
                                                            primaryFrPath,
                                                            header=False)
    # open the secondaries and merge them one by one
    for secondaryPath in pathOrListOfPathsToSecondary:
        pathToSec, secFolder = dividePaths(secondaryPath)
        # open the secondary dataframes
        secAnnotDf, secRefDf = utilsOs.getDataFrameFromArgs(
            pathToSec,
            u'{0}sampleReference.tsv'.format(secFolder),
            header=False)
        secEnDf, secFrDf = utilsOs.getDataFrameFromArgs(
            u'{0}sample.en'.format(secFolder),
            u'{0}sample.fr'.format(secFolder),
            header=False)
        # concatenate the primary with the secondary
        primaryAnnotDf = utilsOs.concatenateDfsOrSeries(
            [primaryAnnotDf, secAnnotDf])
        primaryRefDf = utilsOs.concatenateDfsOrSeries([primaryRefDf, secRefDf])
        primaryEnDf = utilsOs.concatenateDfsOrSeries([primaryEnDf, secEnDf])
        primaryFrDf = utilsOs.concatenateDfsOrSeries([primaryFrDf, secFrDf])
    # dump in the primary's path
    utilsOs.dumpDataFrame(primaryAnnotDf, pathToPrimary, header=False)
    utilsOs.dumpDataFrame(primaryRefDf, primaryRefPath, header=False)
    utilsOs.dumpDataFrame(primaryEnDf, primaryEnPath, header=False)
    utilsOs.dumpDataFrame(primaryFrDf, primaryFrPath, header=False)
    # bug fix: re-dump the annotation file so the scores 1.0 and 0.0 do not
    # end up stored as the integers 1 and 0
    with open(pathToPrimary) as annotFile:
        annotLines = annotFile.readlines()
    for aIndex, aLine in enumerate(annotLines):
        if aLine == u'1\n':
            annotLines[aIndex] = u'1.0\n'
        elif aLine == u'0\n':
            annotLines[aIndex] = u'0.0\n'
    utilsOs.dumpRawLines(annotLines, pathToPrimary, addNewline=False)
def randomlyExtractAndDump(extractedSp, extractionSize, subsetName):
    """ given a dict with all the heuristically extracted lines, randomly
    samples up to extractionSize lines per extractor type and dumps them
    :param extractedSp: dict of the form {extractorType: {filePath: [lines]}}
    :param extractionSize: maximum number of lines to sample per extractor type
    :param subsetName: suffix used in the output file names
    :return: the sampled 'path<TAB>line' list of the LAST extractor type treated
    """
    outputDict = {
        0:
        u'./003negativeNaiveExtractors/numberCoincidence/random100Nb{0}.tsv'.
        format(subsetName),
        1:
        u'./003negativeNaiveExtractors/fewTokens/random100few{0}.tsv'.format(
            subsetName),
        2:
        u'./003negativeNaiveExtractors/cognates/random100cog{0}.tsv'.format(
            subsetName)
    }
    # robustness: make sure the returned name is bound even if extractedSp is empty
    dejaVu = []
    for extrType, fileDict in extractedSp.items():
        # maintain a census of which lines we have already used; the dedup keys
        # are 'path<TAB>lineIndex' strings kept in a set
        # (bug fix: the original checked index-based keys against the
        # content-based entries stored in dejaVu, so the duplicate check could
        # never match and duplicates were possible)
        dejaVu = []
        seenKeys = set()
        # count the total lines
        print(u"-  EXTRACTION TYPE : ", extrType, u'NUMBER OF FILES : ',
              len(fileDict))
        nbLines = sum(len(lineList) for lineList in fileDict.values())
        print(u'\tNUMBER OF EXTRACTED LINES : ', nbLines)
        dictPaths = list(fileDict.keys())
        # we stop if we achieve our limit
        while len(dejaVu) < extractionSize:
            # get the file path index; if it's empty then abort
            rdmFileIndex = getRandomIndex(dictPaths)
            if rdmFileIndex is None:
                break
            # get the list of the lines; if it's empty, abort
            lineList = fileDict[dictPaths[rdmFileIndex]]
            rdmLineIndex = getRandomIndex(lineList)
            if rdmLineIndex is None:
                break
            # re-draw while we land on an already sampled line; the retry count
            # is bounded so we cannot loop forever when fewer than
            # extractionSize distinct lines exist
            key = u'{0}\t{1}'.format(dictPaths[rdmFileIndex], rdmLineIndex)
            attempts = 0
            while key in seenKeys and attempts < 10000:
                rdmFileIndex = getRandomIndex(dictPaths)
                lineList = fileDict[dictPaths[rdmFileIndex]]
                rdmLineIndex = getRandomIndex(lineList)
                key = u'{0}\t{1}'.format(dictPaths[rdmFileIndex], rdmLineIndex)
                attempts += 1
            if key in seenKeys:
                # no unseen line could be found, stop sampling this type
                break
            # add to the deja vu
            seenKeys.add(key)
            dejaVu.append(u'{0}\t{1}'.format(dictPaths[rdmFileIndex],
                                             lineList[rdmLineIndex]))
        # dump
        utilsOs.dumpRawLines(dejaVu,
                             outputDict[extrType],
                             addNewline=True,
                             rewrite=True)
        dumpReferenceToLangFiles(dejaVu, outputDict[extrType])
    return dejaVu
예제 #5
0
def getANewSpWhereWeLeftOff(refPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """ Returns the next untranslated sentence pair according to the reference
    file, which stores one 'path<TAB>lastSeenIndex' line per corpus file
    ('done' in the index column means the file is finished).
    :param refPath: path to the reference file tracking where we left off
    :return: None if no file has an index left to process; otherwise a tuple
        ([enSentence, frSentence], lastSeenPath, indexLn, refLns) where refLns
        is the UPDATED reference content — the caller is expected to dump it
    """
    # check if the ref file already exists
    if utilsOs.theFileExists(refPath) is False:
        utilsOs.createEmptyFile(refPath)
    # open the reference file
    lastSeenIndex, lastSeenPath = None, None
    with open(refPath) as ref:
        # first line
        refLns = ref.readlines()
        refIndex = 0
        for refLn in refLns:
            refList = refLn.replace(u"\n", u"").split(u"\t")
            # test if we have an index for the path
            try:
                lastSeenIndex = int(refList[1])
                lastSeenPath = refList[0]
                break
            # if there is no integer (e.g. the column says 'done'), then we
            # already saw all lns for that path; move on to the next ref line
            except ValueError:
                pass
            # next ref index
            refIndex += 1
    # open the last seen file at the (last seen index + 1) and return the sp in the en and fr files
    if lastSeenIndex is None:
        return None
    with open(u"{0}.en".format(lastSeenPath)) as enFile:
        with open(u"{0}.fr".format(lastSeenPath)) as frFile:
            enLn = enFile.readline()
            frLn = frFile.readline()
            indexLn = 0
            while enLn:
                if indexLn == lastSeenIndex+1:
                    # replace the ref line with its next index; the CALLER
                    # dumps the updated refLns to disk
                    refLns[refIndex] = u"{0}\t{1}\n".format(lastSeenPath, indexLn)
                    # return the sentence pair (newlines stripped)
                    return [enLn.replace(u"\n", u""), frLn.replace(u"\n", u"")], lastSeenPath, indexLn, refLns
                # next line
                enLn = enFile.readline()
                frLn = frFile.readline()
                indexLn += 1
    # if we went over the whole document and it ended, mark the ref line as
    # 'done', dump it and start over with the next unfinished file (recursion)
    refLns[refIndex] = u"{0}\tdone\n".format(lastSeenPath)
    utilsOs.dumpRawLines(refLns, refPath, addNewline=False, rewrite=True)
    return getANewSpWhereWeLeftOff(refPath)
예제 #6
0
	def getReliableJobTitles(self, jobAndPitchFilePath, lang=u'en', outputPath=None, includeJobsWithNApitch=True):
		'''
		Make a set containing the job titles that might be considered more reliable:
		filter 1:
			- having less than 3 tokens
			- not having ampersand (&) or slash (/) signs
			- not having acronyms
		filter 2:
			- present more than once (no hapax)
			- being the right language (en/fr in both 'job' and 'pitch')
		:param jobAndPitchFilePath: path to the utf8 file containing the job and pitch data
		:param lang: language code passed to the second filter
		:param outputPath: optional path where the resulting set is dumped
		:param includeJobsWithNApitch: passed to the second filter, controls
			whether jobs lacking a pitch are kept
		:return: set of the job titles considered reliable
		'''
		#filter 1: structural criteria on the job title itself
		with codecs.open(jobAndPitchFilePath, 'r', encoding='utf8') as openedFile:
			candidatesDict = self.reliableFilter1(openedFile)
		#filter 2: frequency and language criteria
		setOfReliableJobs = self.reliableFilter2(candidatesDict, lang, includeJobsWithNApitch)

		#dump the output if the output path is specified
		if outputPath is not None:
			utilsOs.dumpRawLines(setOfReliableJobs, outputPath, addNewline=True, rewrite=True)
		return setOfReliableJobs
def repairHeuristicsScore(heuristicName, corpus=None):
    """ rewrite the score files in order to correct some problems
    known heuristic names:
    u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url', u'mono', u'tabl', 'strBcks', 'punct', 'gibb'
    :param heuristicName: name of the heuristic whose score files must be repaired
    :param corpus: list of corpus folder names to treat (defaults to all four)
    """
    # avoid a mutable default argument: build the default list per call
    if corpus is None:
        corpus = [u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY', u'NOT-FLAGGED']
    basePath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/'
    for name in corpus:
        scorePath = u'{0}{1}/{2}/score.tsv'.format(basePath, name,
                                                   heuristicName)
        with open(scorePath) as scoreFile:
            scoreLines = scoreFile.readlines()
        # line by line
        for lnIndex, scoreLn in enumerate(scoreLines):
            scoreList = scoreLn.replace(u'\n', u'').split(u'\t')
            # untouchable lines are marked as 'na'
            if scoreList[0] == u'na':
                continue
            # change depending on heuristic
            if heuristicName == u'url':
                # score = smallest count / greatest count (columns 3 and 4)
                colA, colB = int(scoreList[3]), int(scoreList[4])
                if colA + colB != 0:
                    scoreList[0] = str(float(min(colA, colB)) / float(max(colA, colB)))
                    scoreLines[lnIndex] = u'{0}\n'.format(u'\t'.join(scoreList))
            elif heuristicName == u'mono':
                # score = smallest count / greatest count (columns 1 and 2)
                colA, colB = int(scoreList[1]), int(scoreList[2])
                if colA + colB != 0:
                    scoreList[0] = str(float(min(colA, colB)) / float(max(colA, colB)))
                    scoreLines[lnIndex] = u'{0}\n'.format(u'\t'.join(scoreList))
            elif heuristicName == u'ion':
                # columns 1 and 2 sum to 2 or less: mark the score not applicable
                if int(scoreList[1]) + int(scoreList[2]) <= 2:
                    scoreList[0] = u'na'
                    scoreLines[lnIndex] = u'{0}\n'.format(
                        u'\t'.join(scoreList))
        utilsOs.dumpRawLines(scoreLines,
                             scorePath,
                             addNewline=False,
                             rewrite=True)
예제 #8
0
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
	'''
	given the path to the sigma.js exportation environment (ending in 
	the folder "network/"), it changes the config.json file and the index.html
	file so they show the graph the way intended
	:param pathToTheExportationEnvironment: path to the "network/" folder
	:raise ValueError: if index.html has no '<dt class="colours">' marker line
	'''
	#overwrite the config.json file
	configContent = {"type": "network","version": "1.0","data": "data.json","logo": {"file": "","link": "","text": ""},"text": {"more": "","intro": "","title": ""},"legend": {"edgeLabel": "","colorLabel": "","nodeLabel": ""},"features": {"search": True,"groupSelectorAttribute": True,"hoverBehavior": "default"},"informationPanel": {"groupByEdgeDirection": True,"imageAttribute": False},"sigma": {"drawingProperties": {"defaultEdgeType": "curve","defaultHoverLabelBGColor": "#002147","defaultLabelBGColor": "#ddd","activeFontStyle": "bold","defaultLabelColor": "#000","labelThreshold": 999,"defaultLabelHoverColor": "#fff","fontStyle": "bold","hoverFontStyle": "bold","defaultLabelSize": 14},"graphProperties": {"maxEdgeSize": 2,"minEdgeSize": 2,"minNodeSize": 0.25,"maxNodeSize": 2.5},"mouseProperties": {"maxRatio": 20,"minRatio": 0.75}}}
	pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
	if utilsOs.theFileExists(pathConfigJson):
		os.remove(pathConfigJson)
	utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)
	#getting the color information from the data file
	colorCommunityDict = {}
	dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
	for nodeDict in dataDict[u'nodes']:
		try:
			if nodeDict[u'attributes'][u'community_lvl_0'] not in colorCommunityDict:
				colorCommunityDict[nodeDict[u'attributes'][u'community_lvl_0']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
		except KeyError:
			#nodes lacking the community/color attributes are simply skipped
			pass
	#modifying the index.html file: insert the community color legend right
	#after the '<dt class="colours">' marker line
	indexDivisor = None
	with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
		fileLines = indexFile.readlines()
		for index, line in enumerate(fileLines):
			if line == u'\t\t<dt class="colours"></dt>\n':
				indexDivisor = index + 1
				break
		#bug fix: the original raised an obscure NameError when the marker was absent
		if indexDivisor is None:
			raise ValueError(u'no "colours" <dt> marker line found in index.html')
		fileLines = fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values()) + [u'\t\t</dd>\n'] + fileLines[indexDivisor+1:]
	utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment), addNewline=False, rewrite=True)
def annotateFiles(listOfFilesPath=None,
                  annotatedOutputFolder=u'./002manuallyAnnotated/',
                  dumpSP=True):
    """ given a list of paths, manually show and annotate the sentence pairs
    :param listOfFilesPath: None (randomly select 100 docs), a .json file path,
        a single path string or a list of paths
    :param annotatedOutputFolder: folder where the sampleReference.tsv,
        sampleAnnotation.tsv and sample.en/.fr files are dumped
    :param dumpSP: if True, also dump the raw sentence pairs to sample.en/.fr
    """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif isinstance(listOfFilesPath, str):
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
            noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [
            file for file in listOfFilesPath if file not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned
        \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy
        \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography
        \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines
        \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors
        \n1 - well aligned
        \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections 
        \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(
            filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(
            filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                    duringSentSource,
                                                    beforeSentTarget,
                                                    duringSentTarget,
                                                    listOfAnnotations,
                                                    lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [
                            ln.replace(u'\n', u'')
                            for ln in refFile.readlines()
                        ]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent,
                        u'{0}sample.en'.format(annotatedOutputFolder),
                        addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent,
                        u'{0}sample.fr'.format(annotatedOutputFolder),
                        addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                        duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                        duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringTarget))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count if the lines that take the space of 2 lines
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(
                        u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [
                                u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3',
                                u'1.0', u'1.1', u'c', u'correct'
                        ]:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(
                                u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(
                            u'Specific type annotation: ')
                        # bug fix: input() returns a str, so the original
                        # comparison to int 0 was always False and misaligned
                        # SPs could never be given specific types 2 or 3
                        typeAnswers = [
                            u'0', u'1', u'2', u'3'
                        ] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(
                                    u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                    annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(
                            filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [
                                ln.replace(u'\n', u'')
                                for ln in refFile.readlines()
                            ]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the SP
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent,
                            u'{0}sample.en'.format(annotatedOutputFolder),
                            addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent,
                            u'{0}sample.fr'.format(annotatedOutputFolder),
                            addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
예제 #10
0
def launchForOneDay(tokLimit=4000,
                    outputFolderPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/",
                    coffeeBreak=1650):
    """
    launches the deepL bot for one day's worth of translations
    :param tokLimit: maximum number of tokens to treat in the day (per user)
    :param outputFolderPath: path to the folder where will be output the files
    :param coffeeBreak: time in seconds when to take a break and start a new deepL session
    :return: (tokCount, iterCount): number of tokens translated and iterations
        made for the LAST user treated
    """
    start = utilsOs.countTime()
    # path to the referencer, indicating where we left off: path and last index worked
    referencerPath = u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"
    # info
    deepLUrl = u"https://www.deepl.com/translator"
    mUser, mPass, sUser, sPass = b000path.getDeepLProfileInfo()
    # for each user
    for user, passw in zip([sUser, mUser], [sPass, mPass]):
        tokCount = 0
        # open the driver (a fresh Firefox session per user)
        session = webdriver.Firefox()
        session.get(deepLUrl)
        # random sleeps throughout mimic human pacing
        time.sleep(random.uniform(1.3, 3.1))
        # log to deepL
        session = authentificateBtUseSelenium(user, passw, session)
        # while we have not gone over the daily limit (small safety margin of 10)
        iterCount = 0
        while tokCount < (tokLimit-10):
            # get the next untranslated sentence pair and updated referencer lines
            sp, filePath, fileIndex, refLns = getANewSpWhereWeLeftOff(referencerPath)
            session, nbOfTok, enFrTranslAndAlt, frEnTranslAndAlt, timeEn, timeFr = translateSpGetResult(session, sp)
            # dump the referencer lines (persists our progress)
            utilsOs.dumpRawLines(refLns, referencerPath, addNewline=False, rewrite=True)
            # dump original sp
            utilsOs.appendLineToFile(sp[0], u"{0}originalSent.en".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(sp[1], u"{0}originalSent.fr".format(outputFolderPath), addNewLine=True)
            # dump translation and variants
            utilsOs.appendLineToFile(enFrTranslAndAlt, u"{0}translated.en2fr".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(frEnTranslAndAlt, u"{0}translated.fr2en".format(outputFolderPath), addNewLine=True)
            # dump reference
            utilsOs.appendLineToFile(u"{0}\t{1}\n".format(filePath, fileIndex),
                                     u"{0}reference.tsv".format(outputFolderPath), addNewLine=False)
            # dump timestamp
            utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeEn, transformTimeToLocalTime(timeEn)),
                                     u"{0}timestamp.en".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeFr, transformTimeToLocalTime(timeFr)),
                                     u"{0}timestamp.fr".format(outputFolderPath), addNewLine=True)
            # add number of tokens
            tokCount += nbOfTok
            # add nb of iterations
            iterCount += 1
            # take a coffee break if it's time: close the session and reopen a new one
            if coffeeBreak is not None and utilsOs.countTime(start) >= coffeeBreak:
                session.close()
                time.sleep(random.uniform(60, 80))
                start = utilsOs.countTime()
                # open the driver
                session = webdriver.Firefox()
                session.get(deepLUrl)
                time.sleep(random.uniform(1.3, 3.1))
                # log to deepL
                session = authentificateBtUseSelenium(user, passw, session)
            time.sleep(random.uniform(1.0, 1.5))
        # close the driver before switching to the next user
        session.close()
        time.sleep(random.uniform(10.0, 15.0))
    return tokCount, iterCount
예제 #11
0
def applyNormalisationGetResult(testFilePath,
                                normOutPath=None,
                                ororazeOutput=(True, True),
                                useAbbrDict=False,
                                normalizationFunction=None,
                                *args):
    """Apply a normalization function to a gold-standard file and evaluate it.

    If normalizationFunction is None the baseline is produced; otherwise the
    function is applied, the output is ororaze-d and evaluated against the
    gold standard.

    :param testFilePath: path to the tab-separated gold standard file
        (header line, then lines of the form 'id<TAB>original<TAB>gold')
    :param normOutPath: if not None, dump the detailed results to this path
        (plus one file per column and a '.results' summary file)
    :param ororazeOutput: True to apply ororaZe(advanced=True), or an
        (apply, advanced) tuple/list controlling the ororaZe call
    :param useAbbrDict: True to use the default abbreviation dictionary, or a
        custom dictionary passed to ororaZeAbbreviations; False to skip
    :param normalizationFunction: optional callable taking
        (comment, dejavuDict, *args) and returning (output, dejavuDict)
    :param args: extra positional arguments forwarded to normalizationFunction
    :return: dict with the exact-positive count, total comments and ratio
    """
    positiveEvalCounter = 0
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # total number of comments (file line count minus the header)
        totalComments = utilsOs.countLines(gsFile) - 1
    # create the output files if a dump path was given
    if normOutPath is not None:
        normFile = utilsOs.createEmptyFile(
            normOutPath,
            headerLine=
            u'Id\tEvaluation\tErrorTokens\tOriginal\tOutput\tGoldStandard')
        # one separate file per column
        origFile = utilsOs.createEmptyFile(
            normOutPath.replace(u'.tsv', u'1Orig.tsv'),
            headerLine=u'Id\tOriginal')
        outFile = utilsOs.createEmptyFile(
            normOutPath.replace(u'.tsv', u'2Out.tsv'),
            headerLine=u'Id\tEvaluation\tOutput')
        goldFile = utilsOs.createEmptyFile(
            normOutPath.replace(u'.tsv', u'3Gold.tsv'),
            headerLine=u'Id\tGoldStandard')
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # dispose of the header line
        gsFile.readline()
        # memoization dict shared by the normalization function across lines
        dejavuDict = {}
        for line in gsFile:
            # get data
            commentId, originalComment, goldStandard = (
                line.replace(u'\n', u'')).split(u'\t')
            # detect french feminin accords and fossilize the words so the
            # normalization function leaves them unchanged
            normOutput = frenchFemininAccordsCodification(originalComment,
                                                          isInput=True)
            # apply orora solution to abbreviations
            if useAbbrDict is not False:
                if useAbbrDict is not True:
                    normOutput = ororaZeAbbreviations(normOutput, useAbbrDict)
                else:
                    normOutput = ororaZeAbbreviations(normOutput)
            # apply the normalization function
            if normalizationFunction is not None:
                normOutput, dejavuDict = normalizationFunction(
                    normOutput.lower(), dejavuDict, *args)
            # reverse the feminin accord code back into its original form
            normOutput = frenchFemininAccordsCodification(normOutput,
                                                          isInput=False)
            # ororaze the normalized output
            if ororazeOutput is True:
                normOutput = ororaZe(normOutput, advanced=True)
            elif type(ororazeOutput) is tuple or type(ororazeOutput) is list:
                if ororazeOutput[0] is True:
                    normOutput = ororaZe(normOutput, advanced=ororazeOutput[1])
            # evaluate whether the normalized output matches the gold standard
            positiveEvalCounter, evaluation = normalizationEvaluator(
                normOutput, goldStandard, positiveEvalCounter)
            # tokens that do not correspond exactly and their edit distance
            errorTokList = utilsString.getcorrespondingTokensAndEditDist(
                normOutput, goldStandard) if evaluation == 0 else u'na'
            # dump
            if normOutPath is not None:
                normFile.write(u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(
                    commentId, evaluation, errorTokList, originalComment,
                    normOutput, goldStandard))
                # dump to column separate files
                origFile.write(u'{0}\t{1}\n'.format(commentId,
                                                    originalComment))
                outFile.write(u'{0}\t{1}\t{2}\t{3}\n'.format(
                    commentId, evaluation, errorTokList, normOutput))
                goldFile.write(u'{0}\t{1}\n'.format(commentId, goldStandard))
    # guard against a header-only file (avoid ZeroDivisionError)
    ratio = (float(positiveEvalCounter) /
             float(totalComments)) if totalComments else 0.0
    # close the dump files and write the summary
    if normOutPath is not None:
        normFile.close()
        origFile.close()
        outFile.close()
        goldFile.close()
        # dump the results
        resultsPath = u'{0}.results'.format(normOutPath.replace(u'.tsv', u''))
        utilsOs.dumpRawLines([
            u'NORMALIZATION RESULTS', u'exact positives: {0}/{1}'.format(
                positiveEvalCounter, totalComments),
            u'ratio: {0}'.format(ratio)
        ], resultsPath)
    return {
        u'exact positives': positiveEvalCounter,
        u'total comments': totalComments,
        u'ratio': ratio
    }
def changeAnnotations(folderPathToReannotate,
                      annotationTochange=[u'0.3', u'1.1']):
    """ Given a path where to find the annotation files, interactively replace
    every annotation whose current (old) value appears in annotationTochange
    with a new annotation typed by the annotator, then dump the result.

    :param folderPathToReannotate: folder containing the annotation data read
        by getAnnotationData and the sampleAnnotation.tsv file to rewrite
    :param annotationTochange: old annotation label(s) to be replaced; a bare
        string is also accepted and wrapped in a list
    :return: None (the new annotations are dumped to sampleAnnotation.tsv)
    """
    # transform the annotation into a list if need be
    if type(annotationTochange) is str:
        annotationTochange = [annotationTochange]
    # get the annotation data
    sentEnList, sentFrList, sentRefList, sentAnnotList = getAnnotationData(
        folderPathToReannotate)
    # print the annotator cheat sheet
    printCheatSheet()
    # annotate only when we find the problematic old annotation
    for indexAnnot, oldAnnot in enumerate(list(sentAnnotList)):
        if oldAnnot not in annotationTochange:
            continue
        # source and target depend on the direction stored in the reference
        src = sentEnList[indexAnnot] if u'en-fr' in sentRefList[
            indexAnnot] else sentFrList[indexAnnot]
        trgt = sentFrList[indexAnnot] if u'en-fr' in sentRefList[
            indexAnnot] else sentEnList[indexAnnot]
        print(u'{0} - {1}'.format(indexAnnot + 1, src))
        print(u'{0} - {1}'.format(indexAnnot + 1, trgt))
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(
            u'Old annotation is {0}, what is the new one: '.format(
                oldAnnot))
        # make sure to have the right general annotation
        while annotatorGeneralInput not in [
                u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1',
                u'1.2', u'1.3', u'1.4', u'c', u'correction'
        ]:
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            annotatorGeneralInput = input(u'Repeat annotation: ')
        # BUGFIX: the validation loop above accepts u'c' and u'correction',
        # so this branch must test the same values (it used u'correct',
        # letting u'correction' fall through and be saved as an annotation)
        if annotatorGeneralInput in [u'c', u'correction']:
            annotatorGeneralInput, sentAnnotList = correctionToAnnotation(
                sentAnnotList)
        # if we still need to specify what type of alignment or misalignment
        if annotatorGeneralInput in [u'0', u'1']:
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            # get the second part of the annotation (aligned or not)
            annotatorSpecificInput = input(u'Specific type annotation: ')
            # BUGFIX: compare with the string u'0' — input() returns str, so
            # the old comparison with the int 0 was always False and the
            # 3-answer list for aligned pairs was never selected
            typeAnswers = [u'0', u'1', u'2'
                           ] if annotatorGeneralInput == u'0' else [
                               u'0', u'1', u'2', u'3', u'4'
                           ]
            # make sure to have the right specific annotation
            while annotatorSpecificInput not in typeAnswers:
                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                annotatorSpecificInput = input(
                    u'Repeat type annotation: ')
            # save to the list of annotations
            sentAnnotList[indexAnnot] = u'{0}.{1}'.format(
                annotatorGeneralInput, annotatorSpecificInput)
        # if the right answer was given in the right format right away
        else:
            # save to the list of annotations
            sentAnnotList[indexAnnot] = str(annotatorGeneralInput)
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(3, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        for e in range(2):
            print(u' ' * (max([len(src), len(trgt)]) + 6))
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
    # remove format problematic annotations
    sentAnnotList = [
        u'1.1' if annot == u'1.1.0' else annot for annot in sentAnnotList
    ]
    sentAnnotList = [
        u'0.1' if annot == u'0.1.0' else annot for annot in sentAnnotList
    ]
    # dump new annotation
    sentAnnotPath = u'{0}sampleAnnotation.tsv'.format(folderPathToReannotate)
    utilsOs.dumpRawLines(sentAnnotList,
                         sentAnnotPath,
                         addNewline=True,
                         rewrite=True)
def annotateFilesAfterHeurAndSelection(inputFolderPath,
                                       outputFolderPath,
                                       dumpSP=True):
    """ Given a folder path where the reference, en line and fr line are
    already selected, interactively annotate the sentence pairs (SPs).

    :param inputFolderPath: folder containing sampleReference.Paths,
        sample.en, sample.fr and scores.tsv
    :param outputFolderPath: folder where the reference, annotation and
        (optionally) the SP files are dumped after every annotation
    :param dumpSP: if True, also append each annotated sentence pair and its
        score line to the output sample.en / sample.fr / scores.tsv files
    """
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(
            inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # get rid of the files we have already annotated, so an interrupted
    # session can be resumed where it stopped
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        newRefLines = []
        for ind, file in enumerate(referenceLines):
            if file.replace(u'\n', u'') not in annotatedFiles:
                # keep the ORIGINAL index so enLns/frLns/scLns stay aligned
                newRefLines.append([ind, file.replace(u'\n', u'')])
        referenceLines = newRefLines
        # print(referenceLines)
    else:
        # nothing annotated yet: start from scratch
        referencePathLine = []
        listOfAnnotations = []
        referenceLines = [(ind, file.replace(u'\n', u''))
                          for ind, file in enumerate(referenceLines)]
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for tupleRef in referenceLines:
        # indRef indexes into the input sample files, refLn is the path line
        indRef, refLn = tupleRef[0], tupleRef[1]
        print(u'############# {0} ##############'.format(
            refLn.replace(u'\n', u'')))
        # get the path for the source and target
        lnsSource = enLns if u'en-fr' in refLn else frLns
        lnsTarget = frLns if u'en-fr' in refLn else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color in red the during lines (ANSI bold red escape codes)
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]],
                                   lineLength)
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # make sure to have the right general annotation
        while True:
            if annotatorGeneralInput in [
                    u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2',
                    u'1.3', u'1.4', u'c', u'correction'
            ]:
                break
            else:
                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                annotatorGeneralInput = input(u'Repeat annotation: ')
        # NOTE(review): the validation loop above accepts u'correction' but
        # this branch tests u'correct', so typing u'correction' skips the
        # correction path and hits float(u'correction') below — confirm and
        # align the two answer lists
        if annotatorGeneralInput in [u'c', u'correct']:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        for e in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(
            referencePathLine,
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(
            listOfAnnotations,
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the SP (the EN/FR sentence pair and its score line)
        if dumpSP is True:
            enSent = lnsSource[indRef] if u'en-fr' in refLn else lnsTarget[
                indRef]
            frSent = lnsTarget[indRef] if u'en-fr' in refLn else lnsSource[
                indRef]
            utilsOs.appendLineToFile(enSent,
                                     u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frSent,
                                     u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef],
                                     u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)