def appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outPutPath, tokEnDict, tokFrDict, spFreqDict):
    """
    Read the english and french sentence files in parallel and append each
    sentence pair (SP) to the output file as a 3-line record:
    SP frequency, english sentence as id codes, french sentence as id codes.

    :param pathToEnFile: path to the english sentences file (drives the loop)
    :param pathToFrFile: path to the french sentences file (read in lockstep)
    :param outPutPath: path of the file the records are appended to
    :param tokEnDict: english dict handed to transformStringToGizaFormat
    :param tokFrDict: french dict handed to transformStringToGizaFormat
    :param spFreqDict: dict indexed by u"<en>***---***<fr>" giving the SP frequency
    :return: None
    """
    spKeyTemplate = u"{0}***---***{1}"
    with open(pathToEnFile) as enFile, open(pathToFrFile) as frFile:
        # the english file decides when to stop, the french one just follows
        for rawEn in enFile:
            rawFr = frFile.readline()
            enSent = rawEn.replace(u"\n", u"")
            frSent = rawFr.replace(u"\n", u"")
            # frequency of the sentence pair (KeyError if the pair is unknown)
            spFreq = spFreqDict[spKeyTemplate.format(enSent, frSent)]
            # sentences expressed as id codes
            enIds = transformStringToGizaFormat(
                enSent, tokEnDict, u"en", pathToEnFile)
            frIds = transformStringToGizaFormat(
                frSent, tokFrDict, u"fr", pathToFrFile)
            # one record = three lines
            record = u"{0}\n{1}\n{2}".format(spFreq, enIds, frIds)
            utilsOs.appendLineToFile(record, outPutPath, addNewLine=True)
def delEmptyLinesAndDump(inPath, outPath):
    """
    Copy the 4 parallel files (extracted.fr, extracted.en, reference.tsv,
    scores.tsv) from inPath to outPath, leaving out every row whose french or
    english line is empty once newlines, tabs and spaces are removed.

    :param inPath: prefix (folder path ending with a separator) of the input files
    :param outPath: prefix (folder path ending with a separator) of the output files
    :return: None
    """
    def _isBlank(line):
        # a line made only of newline, tab and space characters counts as empty
        return line.replace(u'\n', u'').replace(u'\t', u'').replace(u' ', u'') == u''

    frName = u'{0}extracted.fr'
    enName = u'{0}extracted.en'
    refName = u'{0}reference.tsv'
    scName = u'{0}scores.tsv'
    with open(frName.format(inPath)) as ff, \
            open(enName.format(inPath)) as ef, \
            open(refName.format(inPath)) as rf, \
            open(scName.format(inPath)) as sf:
        # the french file drives the loop, the 3 others are read in lockstep
        for frLn in ff:
            enLn = ef.readline()
            refLn = rf.readline()
            scLn = sf.readline()
            if not _isBlank(frLn) and not _isBlank(enLn):
                # the lines still carry their own newline, so none is added
                utilsOs.appendLineToFile(frLn, frName.format(outPath), addNewLine=False)
                utilsOs.appendLineToFile(enLn, enName.format(outPath), addNewLine=False)
                utilsOs.appendLineToFile(refLn, refName.format(outPath), addNewLine=False)
                utilsOs.appendLineToFile(scLn, scName.format(outPath), addNewLine=False)
    return None
def randomSPselectionForAnnotation(enPath, frPath, refPath, scPath, outputFolderPath, nbSp=150):
    """
    Draw nbSp distinct line indexes and copy the matching reference, english,
    french and score lines to the output folder, ready to be annotated.
    Previous sample files in the output folder are deleted first.

    :param enPath: path to the english sentences file
    :param frPath: path to the french sentences file
    :param refPath: path to the reference file (its number of lines bounds the draw)
    :param scPath: path to the scores file
    :param outputFolderPath: folder where the sample files are written
    :param nbSp: number of sentence pairs to select
    :return: None
    """
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    enOut = u'{0}sample.en'.format(outputFolderPath)
    frOut = u'{0}sample.fr'.format(outputFolderPath)
    refOut = u'{0}sampleReference.Paths'.format(outputFolderPath)
    scOut = u'{0}scores.tsv'.format(outputFolderPath)
    # start from a clean slate: remove what a previous run may have left
    for stalePath in (enOut, frOut, refOut, scOut):
        utilsOs.deleteAFile(stalePath)
    # only the number of reference lines is needed
    with open(refPath) as refFile:
        lengthRef = len(refFile.readlines())
    alreadyPicked = set()
    for _ in range(nbSp):
        # draw again until we get an index we have not used yet
        while True:
            rdmInd = getQuasiRandomIndexForcingOnSpecificRange(lengthRef, rangeMin=0, rangeMax=200000)
            if rdmInd not in alreadyPicked:
                break
        alreadyPicked.add(rdmInd)
        # fetch the 4 parallel lines for that index
        refLn, enLn, frLn, scLn = getEnFrLnsForIndex(rdmInd, refPath, enPath, frPath, scPath)
        # dump them in the output folder
        utilsOs.appendLineToFile(enLn, enOut, addNewLine=False)
        utilsOs.appendLineToFile(frLn, frOut, addNewLine=False)
        utilsOs.appendLineToFile(refLn, refOut, addNewLine=False)
        utilsOs.appendLineToFile(scLn, scOut, addNewLine=False)
    return None
def changeStructure():
    """
    Convert the old per-file annotations (the '.tmx' files found under
    ./002manuallyAnnotated/oldOnes/MISALIGNED/) into the flat sample files:
    for every annotation line, append the reference (original path + line
    index), the annotation itself, and the english and french lines taken
    from the original corpus files.

    :return: None
    """
    oldAnnotationRoot = u'./002manuallyAnnotated/oldOnes/MISALIGNED/'
    corpusRoot = u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/'

    def _linesOf(path):
        with open(path) as openedFile:
            return openedFile.readlines()

    for annotationPath in utilsOs.goDeepGetFiles(oldAnnotationRoot, format=u'.tmx'):
        # rebuild the path of the original corpus file from the annotation path
        origPath = corpusRoot + annotationPath.split(u'MISALIGNED/')[-1]
        annotLines = _linesOf(annotationPath)
        enLines = _linesOf(origPath + u'.en')
        frLines = _linesOf(origPath + u'.fr')
        for lineInd, annotation in enumerate(annotLines):
            # both corpus lines are looked up before anything is written
            enLn = enLines[lineInd]
            frLn = frLines[lineInd]
            # dump the reference
            utilsOs.appendLineToFile(
                u'{0}\t{1}\n'.format(origPath, lineInd),
                u'./002manuallyAnnotated/sampleReference.tsv',
                addNewLine=False)
            # dump the annotation
            utilsOs.appendLineToFile(
                annotation,
                u'./002manuallyAnnotated/sampleAnnotation.tsv',
                addNewLine=False)
            # dump the SP
            utilsOs.appendLineToFile(enLn, u'./002manuallyAnnotated/sampleEn.tsv', addNewLine=False)
            utilsOs.appendLineToFile(frLn, u'./002manuallyAnnotated/sampleFr.tsv', addNewLine=False)
def dumpReferenceToLangFiles(listOfRef, outputGeneralFilePath):
    """
    Given a list of references (u"<original path>\\t<line index>"), fetch the
    referenced line in the '.en' and '.fr' original files and append it to the
    language-separated output files.

    :param listOfRef: iterable of reference strings, path and index separated by a tab
    :param outputGeneralFilePath: output path; any u'.tsv' in it is removed and
        u'.en' / u'.fr' are appended to get the two output files
    :return: None
    """
    def _readWithoutNewlines(path):
        with open(path) as openedFile:
            return [ln.replace(u'\n', u'') for ln in openedFile.readlines()]

    basePath = outputGeneralFilePath.replace(u'.tsv', u'')
    enOutputPath = u'{0}.en'.format(basePath)
    frOutputPath = u'{0}.fr'.format(basePath)
    for ref in listOfRef:
        refParts = ref.split(u'\t')
        # read both original files before resolving the index
        enLines = _readWithoutNewlines(u'{0}.en'.format(refParts[0]))
        frLines = _readWithoutNewlines(u'{0}.fr'.format(refParts[0]))
        lineInd = int(refParts[1])
        enLine = enLines[lineInd]
        frLine = frLines[lineInd]
        utilsOs.appendLineToFile(enLine, enOutputPath, addNewLine=True)
        utilsOs.appendLineToFile(frLine, frOutputPath, addNewLine=True)
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    Make the 2 vocabulary files (token id, token, frequency) in the format
    needed by giza++ or mgiza++, then dump the corpus in the giza sentence format.
    The frequency/id dicts are rebuilt from the corpus when overwrite is True or
    when the english frequency json is missing; otherwise they are reloaded
    from the json files.

    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :param overwrite: if True the dicts are recomputed and the output files are
        re-created empty before being filled
    :return: tuple (english vcb path, french vcb path, giza corpus path,
        english freq json path, french freq json path, SP freq json path)
    """
    def _byDecreasingFreq(freqDict):
        # (freq, token) ordering, biggest first
        return sorted(freqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

    # prepare the output paths (all json files live next to the english file)
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(
        pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")

    mustRebuild = overwrite is True or not os.path.isfile(outputEnDictPath)
    if mustRebuild:
        # count the tokens and the sentence pairs straight from the corpus
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        orderedKeysValuesEn = _byDecreasingFreq(enTokFreqDict)
        orderedKeysValuesFr = _byDecreasingFreq(frTokFreqDict)
        # the ids follow the frequency order
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # save everything as json
        for content, jsonPath in ((enTokFreqDict, outputEnDictPath),
                                  (frTokFreqDict, outputFrDictPath),
                                  (spFreqDict, outputSpDictPath),
                                  (enIdDict, outputEnIdDictPath),
                                  (frIdDict, outputFrIdDictPath)):
            utilsOs.dumpDictToJsonFile(content, jsonPath, overwrite)
    else:
        # reuse what a previous run saved
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        orderedKeysValuesEn = _byDecreasingFreq(enTokFreqDict)
        orderedKeysValuesFr = _byDecreasingFreq(frTokFreqDict)

    # re-create the output files (the vcb files start with the UNK line)
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)

    # fill the vocabulary files, ids start at 2 because 1 is taken by UNK
    for orderedKeysValues, vcbPath in ((orderedKeysValuesEn, outputEnPath),
                                       (orderedKeysValuesFr, outputFrPath)):
        for tokId, kv in enumerate(orderedKeysValues, 2):
            vcbLine = u"{0}\t{1}\t{2}".format(tokId, kv[0], kv[1])
            utilsOs.appendLineToFile(vcbLine, vcbPath, addNewLine=True)

    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outputPathGizaFormatCorpus,
                             enIdDict, frIdDict, spFreqDict)
    return (outputEnPath, outputFrPath, outputPathGizaFormatCorpus,
            outputEnDictPath, outputFrDictPath, outputSpDictPath)
def annotateFiles(listOfFilesPath=None, annotatedOutputFolder=u'./002manuallyAnnotated/', dumpSP=True):
    """
    Given a list of paths, manually show and annotate the sentence pairs (SP)
    in the terminal. For every file, each SP is shown in red between the
    previous and the next SP and the annotator is asked for an annotation,
    either in one go (u'0.2', u'1.1', ...) or in two steps (general u'0'/u'1',
    then the specific type). After each SP the references and the annotations
    are re-dumped to sampleReference.tsv and sampleAnnotation.tsv.

    :param listOfFilesPath: None (100 docs are then selected with
        randomlySelectNDocsFromPath), a single path string, a path to a
        '.json' file holding the paths, or directly a list of paths
    :param annotatedOutputFolder: folder prefix (must end with a separator)
        where the annotation files are written and where previous annotations
        are looked up; files already listed there are skipped
    :param dumpSP: if True, the annotated sentences are also appended to
        sample.en and sample.fr in the output folder
    :return: None
    """
    referencePathLine = []
    listOfAnnotations = []
    # every answer accepted for the first question
    generalAnswers = [
        u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3', u'1.0', u'1.1', u'c',
        u'correct'
    ]
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif isinstance(listOfFilesPath, str):
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
            noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [
            path for path in listOfFilesPath if path not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors \n1 - well aligned \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(
            filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(
            filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                    duringSentSource,
                                                    beforeSentTarget,
                                                    duringSentTarget,
                                                    listOfAnnotations,
                                                    lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [
                            ln.replace(u'\n', u'')
                            for ln in refFile.readlines()
                        ]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent,
                        u'{0}sample.en'.format(annotatedOutputFolder),
                        addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent,
                        u'{0}sample.fr'.format(annotatedOutputFolder),
                        addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                        duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                        duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringTarget))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count if the lines that take the space of 2 lines
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(
                        u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while annotatorGeneralInput not in generalAnswers:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        annotatorGeneralInput = input(u'Repeat annotation: ')
                    # the annotator asked to correct a previous annotation
                    # (correctionToAnnotation is defined elsewhere in the file)
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(
                            u'Specific type annotation: ')
                        # misaligned (0) has 4 sub-types, aligned (1) has 2;
                        # the input is a string, so it must be compared with
                        # u'0' (comparing with the int 0 is never true and
                        # made the sub-types 2 and 3 impossible to enter)
                        typeAnswers = [
                            u'0', u'1', u'2', u'3'
                        ] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while annotatorSpecificInput not in typeAnswers:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorSpecificInput = input(
                                u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                    annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(
                            filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [
                                ln.replace(u'\n', u'')
                                for ln in refFile.readlines()
                            ]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the SP (after the shift above, "before" holds the
                    # sentence that has just been annotated)
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent,
                            u'{0}sample.en'.format(annotatedOutputFolder),
                            addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent,
                            u'{0}sample.fr'.format(annotatedOutputFolder),
                            addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def launchForOneDay(tokLimit=4000, outputFolderPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/", coffeeBreak=1650):
    """
    Launches the deepL bot for one day's worth, once per user profile
    (secondary profile first, then main profile). Each translated sentence
    pair is appended, with its reference and timestamps, to the output folder.

    :param tokLimit: maximum number of tokens to treat in the day, per user
    :param outputFolderPath: path to the folder where will be output the files
    :param coffeeBreak: time in seconds after which the session is closed and a
        new deepL session is started (None disables the breaks)
    :return: (tokCount, iterCount) number of tokens translated and number of
        sentence pairs treated for the last user processed
    """
    deepLUrl = u"https://www.deepl.com/translator"

    def _openLoggedSession(userName, userPass):
        # open the driver, load deepL and log in
        browser = webdriver.Firefox()
        browser.get(deepLUrl)
        time.sleep(random.uniform(1.3, 3.1))
        return authentificateBtUseSelenium(userName, userPass, browser)

    start = utilsOs.countTime()
    # path to the referencer, indicating where we left off: path and last index worked
    referencerPath = u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"
    mUser, mPass, sUser, sPass = b000path.getDeepLProfileInfo()
    # output files
    origEnPath = u"{0}originalSent.en".format(outputFolderPath)
    origFrPath = u"{0}originalSent.fr".format(outputFolderPath)
    en2frPath = u"{0}translated.en2fr".format(outputFolderPath)
    fr2enPath = u"{0}translated.fr2en".format(outputFolderPath)
    referencePath = u"{0}reference.tsv".format(outputFolderPath)
    stampEnPath = u"{0}timestamp.en".format(outputFolderPath)
    stampFrPath = u"{0}timestamp.fr".format(outputFolderPath)
    # for each user
    for user, passw in ((sUser, sPass), (mUser, mPass)):
        tokCount = 0
        iterCount = 0
        session = _openLoggedSession(user, passw)
        # while we have not gone over the daily limit
        while tokCount < (tokLimit - 10):
            # get the sp and translate it both ways
            sp, filePath, fileIndex, refLns = getANewSpWhereWeLeftOff(referencerPath)
            session, nbOfTok, enFrTranslAndAlt, frEnTranslAndAlt, timeEn, timeFr = translateSpGetResult(session, sp)
            # dump the referencer lines
            utilsOs.dumpRawLines(refLns, referencerPath, addNewline=False, rewrite=True)
            # dump original sp
            utilsOs.appendLineToFile(sp[0], origEnPath, addNewLine=True)
            utilsOs.appendLineToFile(sp[1], origFrPath, addNewLine=True)
            # dump translation and variants
            utilsOs.appendLineToFile(enFrTranslAndAlt, en2frPath, addNewLine=True)
            utilsOs.appendLineToFile(frEnTranslAndAlt, fr2enPath, addNewLine=True)
            # dump reference
            utilsOs.appendLineToFile(u"{0}\t{1}\n".format(filePath, fileIndex), referencePath, addNewLine=False)
            # dump timestamp
            utilsOs.appendLineToFile(
                u"{0}\tlocal time: {1}".format(timeEn, transformTimeToLocalTime(timeEn)),
                stampEnPath, addNewLine=True)
            utilsOs.appendLineToFile(
                u"{0}\tlocal time: {1}".format(timeFr, transformTimeToLocalTime(timeFr)),
                stampFrPath, addNewLine=True)
            # update the counters
            tokCount += nbOfTok
            iterCount += 1
            # take a coffee break if it's time: restart with a fresh session
            if coffeeBreak is not None and utilsOs.countTime(start) >= coffeeBreak:
                session.close()
                time.sleep(random.uniform(60, 80))
                start = utilsOs.countTime()
                session = _openLoggedSession(user, passw)
            time.sleep(random.uniform(1.0, 1.5))
        # close the driver
        session.close()
        time.sleep(random.uniform(10.0, 15.0))
    return tokCount, iterCount
tmxFilePaths = [ u'{0}{1}'.format(commandPath, fn) for fn in tmxFileNames ] # get the metadata from the tmxFile for tmxData, tmxPath in zip( getTmxFlaggedData(tmxFilePaths), tmxFilePaths): # if there is no flagging data in the tmx, pass if tmxData is None: pass # get and dump the flagging data else: (flagType, index, enSent, frSent, segmentNb, flagDate) = tmxData utilsOs.appendLineToFile( enSent, u'{0}problematic/extracted.en'.format( btExtractPath)) utilsOs.appendLineToFile( frSent, u'{0}problematic/extracted.fr'.format( btExtractPath)) utilsOs.appendLineToFile( u'{0}\t{1}'.format(commandPath, index), u'{0}problematic/referenceDC24Corpus.tsv'. format(btExtractPath)) utilsOs.appendLineToFile( u'{0}\t{1}\t{2}'.format( flagType, segmentNb, flagDate), u'{0}problematic/other.tsv'.format( btExtractPath)) # map the DC24 to the archive1 files
def makeAMixOf2Annotations(inputAnnotPath1, inputAnnotPath2, outputMixPath):
    """
    Given 2 annotations, makes a third annotation made of a mix of the two
    others: from each input folder, half of the rows (rounded down) are drawn
    at random without repetition and appended to the files of the mix folder.

    :param inputAnnotPath1: path to the first annotation folder
    :param inputAnnotPath2: path to the second annotation folder
    :param outputMixPath: path to the mix annotation folder
    :return: None
    """
    def _endingInSlash(folderPath):
        if folderPath[-1] != u'/':
            return u'{0}/'.format(folderPath)
        return folderPath

    def _linesOf(filePath):
        with open(filePath) as openedFile:
            return openedFile.readlines()

    # make sure the paths end in a slash
    inputAnnotPath1 = _endingInSlash(inputAnnotPath1)
    inputAnnotPath2 = _endingInSlash(inputAnnotPath2)
    outputMixPath = _endingInSlash(outputMixPath)
    for inPath in (inputAnnotPath1, inputAnnotPath2):
        # read the 6 parallel files of this annotation
        enLns = _linesOf(u'{0}sample.en'.format(inPath))
        frLns = _linesOf(u'{0}sample.fr'.format(inPath))
        annotLns = _linesOf(u'{0}sampleAnnotation.tsv'.format(inPath))
        refLns = _linesOf(u'{0}sampleReference.tsv'.format(inPath))
        scLns = _linesOf(u'{0}scores.tsv'.format(inPath))
        scMetaLns = _linesOf(u'{0}scoresAndMetaData.tsv'.format(inPath))
        # take half of the rows, the english file gives the number of rows
        quota = int(len(enLns) / 2.0)
        lastInd = len(enLns) - 1
        pickedInds = set()
        while len(pickedInds) < quota:
            # draw until we get a row we have not taken yet
            randomInd = randint(0, lastInd)
            while randomInd in pickedInds:
                randomInd = randint(0, lastInd)
            pickedInds.add(randomInd)
            # dump the row to the mix files
            utilsOs.appendLineToFile(enLns[randomInd],
                                     u'{0}sample.en'.format(outputMixPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frLns[randomInd],
                                     u'{0}sample.fr'.format(outputMixPath),
                                     False)
            utilsOs.appendLineToFile(
                annotLns[randomInd],
                u'{0}sampleAnnotation.tsv'.format(outputMixPath), False)
            utilsOs.appendLineToFile(
                refLns[randomInd],
                u'{0}sampleReference.tsv'.format(outputMixPath), False)
            utilsOs.appendLineToFile(scLns[randomInd],
                                     u'{0}scores.tsv'.format(outputMixPath),
                                     False)
            utilsOs.appendLineToFile(
                scMetaLns[randomInd],
                u'{0}scoresAndMetaData.tsv'.format(outputMixPath), False)
def annotateFilesAfterHeurAndSelection(inputFolderPath, outputFolderPath, dumpSP=True):
    """
    Given a folder path where the reference, en line and fr line are already
    selected (sampleReference.Paths, sample.en, sample.fr, scores.tsv),
    show each sentence pair (SP) in the terminal and ask for its annotation.
    The references already present in the output sampleReference.tsv are
    skipped, so an interrupted annotation session can be resumed. After each
    SP the references and annotations are re-dumped to the output folder.

    :param inputFolderPath: folder holding the selected sample files
    :param outputFolderPath: folder where the annotation files are written
    :param dumpSP: if True, the annotated en/fr sentences and their score line
        are also appended to sample.en, sample.fr and scores.tsv in the output folder
    :return: None
    """
    # keywords asking to correct a previous annotation; the same tuple is used
    # to validate the input and to trigger the correction (before, u'correction'
    # passed the validation but was then sent to float(), raising ValueError)
    correctionKeywords = (u'c', u'correct', u'correction')
    annotationAnswers = (u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1',
                         u'1.2', u'1.3', u'1.4')
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(
            inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # pair every reference with its row index in the input files
    indexedRefs = [(ind, refLine.replace(u'\n', u''))
                   for ind, refLine in enumerate(referenceLines)]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        referenceLines = [(ind, refLine) for ind, refLine in indexedRefs
                          if refLine not in annotatedFiles]
    else:
        referencePathLine = []
        listOfAnnotations = []
        referenceLines = indexedRefs
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for indRef, refLn in referenceLines:
        print(u'############# {0} ##############'.format(
            refLn.replace(u'\n', u'')))
        # get the path for the source and target
        lnsSource = enLns if u'en-fr' in refLn else frLns
        lnsTarget = frLns if u'en-fr' in refLn else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color in red the during lines
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]],
                                   lineLength)
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # make sure to have the right general annotation
        while (annotatorGeneralInput not in annotationAnswers
               and annotatorGeneralInput not in correctionKeywords):
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            annotatorGeneralInput = input(u'Repeat annotation: ')
        # the annotator asked to correct a previous annotation
        # (correctionToAnnotation is defined elsewhere in the file)
        if annotatorGeneralInput in correctionKeywords:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        for e in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(
            referencePathLine,
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(
            listOfAnnotations,
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the SP; whatever the translation direction, the english
        # sentence is in enLns and the french one in frLns
        if dumpSP is True:
            utilsOs.appendLineToFile(enLns[indRef],
                                     u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frLns[indRef],
                                     u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef],
                                     u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)