def makeARefFile(rootFolder=u"/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/", refFilePath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """
    Create the reference file listing every document found under rootFolder.

    One line is written per file returned by utilsOs.goDeepGetFiles for the
    ".tmx.en" format: the file path with ".tmx.en" replaced by ".tmx", a tab,
    and the index -1. getANewSpWhereWeLeftOff resumes at (index + 1), so -1
    makes it start at line 0 of the document.

    :param rootFolder: folder searched for the ".tmx.en" files
    :param refFilePath: path of the reference file to create
    :return: None (nothing is written when refFilePath already exists)
    """
    # make sure the file does not exist yet, an existing reference is never overwritten
    if utilsOs.theFileExists(refFilePath) is True:
        return None
    utilsOs.createEmptyFile(refFilePath)
    # search the folder given as argument (the default value is the original corpus folder)
    listOfFiles = utilsOs.goDeepGetFiles(rootFolder, format=u".tmx.en")
    with open(refFilePath, u"a") as refFile:
        for filePath in listOfFiles:
            refFile.write(u"{0}\t-1\n".format(filePath.replace(u".tmx.en", u".tmx")))
def getANewSpWhereWeLeftOff(refPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """
    Return the next unseen sentence pair according to the reference file.

    Each reference line is "<path>", a tab, and either the integer index of
    the last line already seen for that path or a non-integer marker. The
    first line holding an integer index designates the document to continue;
    its ".en" and ".fr" files are read up to line (index + 1).

    When the document has no such line left, its reference line is replaced
    by "<path>", a tab and "done", the reference file is dumped and the search
    starts over with the next document.

    :param refPath: path to the reference file (created empty if missing)
    :return: None when no reference line holds an integer index, otherwise the
        tuple ([enLine, frLine], documentPath, lineIndex, refLns) where refLns
        are the reference lines with the current line already updated to
        lineIndex. The updated lines are returned, they are not written to
        disk by this function.
    """
    # loop instead of recursing so a long run of exhausted documents cannot hit the recursion limit
    while True:
        # check if the ref file already exists
        if utilsOs.theFileExists(refPath) is False:
            utilsOs.createEmptyFile(refPath)
        with open(refPath) as ref:
            refLns = ref.readlines()
        # look for the first reference line holding an integer index
        lastSeenIndex, lastSeenPath, refIndex = None, None, None
        for candidateIndex, refLn in enumerate(refLns):
            refList = refLn.replace(u"\n", u"").split(u"\t")
            try:
                lastSeenIndex = int(refList[1])
            # no integer: all the lines of that path were seen; no tab at all: malformed line, skip it
            except (ValueError, IndexError):
                continue
            lastSeenPath, refIndex = refList[0], candidateIndex
            break
        # nothing left to process
        if lastSeenIndex is None:
            return None
        # open the last seen document and advance to the (last seen index + 1) line
        nextIndex = lastSeenIndex + 1
        with open(u"{0}.en".format(lastSeenPath)) as enFile, open(u"{0}.fr".format(lastSeenPath)) as frFile:
            enLn = enFile.readline()
            frLn = frFile.readline()
            indexLn = 0
            # the english file drives the iteration, an empty string means end of file
            while enLn:
                if indexLn == nextIndex:
                    # replace the reference line with its next index
                    refLns[refIndex] = u"{0}\t{1}\n".format(lastSeenPath, indexLn)
                    # return the sentence pair
                    return [enLn.replace(u"\n", u""), frLn.replace(u"\n", u"")], lastSeenPath, indexLn, refLns
                # next line
                enLn = enFile.readline()
                frLn = frFile.readline()
                indexLn += 1
        # we went over the whole document: mark it as done, dump the reference and start over
        refLns[refIndex] = u"{0}\tdone\n".format(lastSeenPath)
        utilsOs.dumpRawLines(refLns, refPath, addNewline=False, rewrite=True)
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
    '''
    given the path to the sigma.js exportation environment (ending in the folder "network/"),
    it changes the config.json file and the index.html file so they show the graph the way intended

    - config.json is replaced by the hard-coded configuration below
    - data.json is read to collect one colored legend entry per distinct "community_lvl_0" value
    - index.html gets these entries inserted, wrapped in a <dd> element, right after the
      <dt class="colours"></dt> line; the line that originally followed that marker is dropped

    :param pathToTheExportationEnvironment: folder path, used as a plain string prefix
        (so it must end with a path separator)
    :raises ValueError: if index.html does not contain the <dt class="colours"></dt> marker line
    '''
    # replacing the config.json file
    configContent = {
        "type": "network",
        "version": "1.0",
        "data": "data.json",
        "logo": {"file": "", "link": "", "text": ""},
        "text": {"more": "", "intro": "", "title": ""},
        "legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
        "features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
        "informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
        "sigma": {
            "drawingProperties": {
                "defaultEdgeType": "curve",
                "defaultHoverLabelBGColor": "#002147",
                "defaultLabelBGColor": "#ddd",
                "activeFontStyle": "bold",
                "defaultLabelColor": "#000",
                "labelThreshold": 999,
                "defaultLabelHoverColor": "#fff",
                "fontStyle": "bold",
                "hoverFontStyle": "bold",
                "defaultLabelSize": 14,
            },
            "graphProperties": {"maxEdgeSize": 2, "minEdgeSize": 2, "minNodeSize": 0.25, "maxNodeSize": 2.5},
            "mouseProperties": {"maxRatio": 20, "minRatio": 0.75},
        },
    }
    pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
    if utilsOs.theFileExists(pathConfigJson):
        os.remove(pathConfigJson)
    utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)
    # getting the color information from the data file
    colorCommunityDict = {}
    dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
    for nodeDict in dataDict[u'nodes']:
        try:
            community = nodeDict[u'attributes'][u'community_lvl_0']
            # only the first node met for each community defines its legend entry
            if community not in colorCommunityDict:
                colorCommunityDict[community] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(
                    nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
        # nodes missing the community, color or name information are left out of the legend
        except KeyError:
            pass
    # modifying the index.html file
    pathIndexHtml = u'{0}index.html'.format(pathToTheExportationEnvironment)
    with open(pathIndexHtml) as indexFile:
        fileLines = indexFile.readlines()
    # look for the line right after the colour legend marker
    indexDivisor = None
    for index, line in enumerate(fileLines):
        if line == u'\t\t<dt class="colours"></dt>\n':
            indexDivisor = index + 1
            break
    # fail with an explicit message instead of an unbound variable error
    if indexDivisor is None:
        raise ValueError(u'no <dt class="colours"></dt> line found in {0}'.format(pathIndexHtml))
    # the line following the marker (index indexDivisor) is replaced by the new <dd> element
    fileLines = (fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values())
                 + [u'\t\t</dd>\n'] + fileLines[indexDivisor + 1:])
    utilsOs.dumpRawLines(fileLines, pathIndexHtml, addNewline=False, rewrite=True)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sys
import time
import os
import shutil

# both candidate locations of the utils folder are added to the search path
# before the project helpers are imported
sys.path.append(u'../utils')
sys.path.append(u'./utils')
import utilsOs
import utilsString

# output locations, all built from the same base folder
outputPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/008originalDocumentsBt/'
tempPath = u'{0}tmp/'.format(outputPath)
outputDejaVusDocs = u'{0}json/dejaVusDocs.json'.format(outputPath)
outputDejaVusRequestId = u'{0}json/dejaVusRequests.json'.format(outputPath)

# reload the already seen request ids if a previous run dumped them, otherwise start empty
# (note: the reloaded value is whatever openJsonFileAsDict returns, the fresh value is a set)
dejaVu = (utilsOs.openJsonFileAsDict(outputDejaVusRequestId)
          if utilsOs.theFileExists(outputDejaVusRequestId) is True else set())

# count the time the algorithm takes to run
startTime = utilsOs.countTime()

# get url, user and password: the first three lines of the credentials file, in that order
with open(u'./b011pathUserPassword') as pup:
    url, user, passw = (pup.readline().replace(u'\n', u'') for _ in range(3))

# open the driver go to the page
profile = webdriver.FirefoxProfile()
def annotateFiles(listOfFilesPath=None, annotatedOutputFolder=u'./002manuallyAnnotated/', dumpSP=True):
    """
    given a list of paths, manually show and annotate the sentence pairs

    For every document, the ".en" and ".fr" files are read in parallel. Each
    sentence pair is printed in the terminal with the previous and the next
    pair as context, and the annotator types a general annotation (0 or 1)
    followed by a specific type, or directly the full annotation (e.g. 0.2).
    Typing "c" or "correct" hands over to correctionToAnnotation. After every
    annotated pair, the reference and annotation lists are dumped to
    "sampleReference.tsv" and "sampleAnnotation.tsv" in annotatedOutputFolder.

    :param listOfFilesPath: None (100 documents are then randomly selected), a
        list of document paths, the path to a ".json" file containing them or
        a single document path given as a string
    :param annotatedOutputFolder: output folder, used as a plain string prefix
    :param dumpSP: if True, each annotated sentence pair is also appended to
        "sample.en" and "sample.fr" in annotatedOutputFolder
    """
    referencePathLine = []
    listOfAnnotations = []
    generalAnswers = [u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3', u'1.0', u'1.1', u'c', u'correct']
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif isinstance(listOfFilesPath, str):
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    # NOTE(review): referencePathLine and listOfAnnotations start empty even when earlier
    # annotations exist, and both are dumped below with rewrite=True; confirm that the
    # results of a previous session are not lost when resuming
    if utilsOs.theFileExists(u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder), noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [file for file in listOfFilesPath if file not in annotatedFiles]
    # print the annotator cheat sheet
    print(""""0 - badly aligned \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors \n1 - well aligned \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target, the translation direction is read in the path
        if u'fr-en' in filePath:
            fileSourcePath = u'{0}.fr'.format(filePath)
            fileTargetPath = u'{0}.en'.format(filePath)
        else:
            fileSourcePath = u'{0}.en'.format(filePath)
            fileTargetPath = u'{0}.fr'.format(filePath)
        sourceIsEnglish = u'.en' in fileSourcePath
        # if the filepath is the reference we build the reference lines ourselves,
        # otherwise they are read from the ".tsv" file accompanying the document
        pathIsReference = u'burtrad' in filePath
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource, duringSentSource,
                                                    beforeSentTarget, duringSentTarget,
                                                    listOfAnnotations, lineLength=137)
                # save the reference
                if pathIsReference:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                else:
                    # read the reference file once per document instead of once per line
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [ln.replace(u'\n', u'') for ln in refFile.readlines()]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if sourceIsEnglish else beforeSentTarget
                    frSent = beforeSentTarget if sourceIsEnglish else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent, u'{0}sample.en'.format(annotatedOutputFolder), addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent, u'{0}sample.fr'.format(annotatedOutputFolder), addNewLine=False)
                duringIndex = 1
                # for each line, an empty string on both sides means both files ended
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count the lines that take the space of 2 lines
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while annotatorGeneralInput not in generalAnswers:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        annotatorGeneralInput = input(u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (the specific type)
                        annotatorSpecificInput = input(u'Specific type annotation: ')
                        # the input is a string: compare it to u'0', not to the integer 0,
                        # otherwise the misalignment types 2 and 3 can never be selected
                        if annotatorGeneralInput == u'0':
                            typeAnswers = [u'0', u'1', u'2', u'3']
                        else:
                            typeAnswers = [u'0', u'1']
                        # make sure to have the right specific annotation
                        while annotatorSpecificInput not in typeAnswers:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorSpecificInput = input(u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput, annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for _ in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference of the pair that was just annotated
                    if pathIsReference:
                        referencePathLine.append(u'{0}\t{1}'.format(filePath, duringIndex))
                    else:
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the SP (the "before" sentences now hold the pair that was just annotated)
                    if dumpSP is True:
                        enSent = beforeSentSource if sourceIsEnglish else beforeSentTarget
                        frSent = beforeSentTarget if sourceIsEnglish else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent, u'{0}sample.en'.format(annotatedOutputFolder), addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent, u'{0}sample.fr'.format(annotatedOutputFolder), addNewLine=False)
        # clear part of terminal
        # NOTE(review): placed once per document, after the per-line loop; confirm against
        # the original layout that this call does not belong inside the loop
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def applyHeuristicsOnNotFlaggedCorpus(filesIndexes, launchId, heuristicsList=None):
    """
    given a corpus and heuristic indication, it applies the heuristic
    to that corpus and dumps the result

    For every line of every selected document and for every heuristic, one
    line produced by getLnToWrite is appended to "<heuristic>/score.tsv" in
    the output folder of the launch. Unless "reference.tsv" already existed
    when the function started, one reference line (anonymized document path,
    tab, line index) is appended to it per document line.

    :param filesIndexes: passed as is to getSubsetOfFiles to select the documents
    :param launchId: name of the output sub-folder
    :param heuristicsList: names of the heuristics to apply, None for all 13
    :return: None
    """
    out = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/{0}/'.format(
        launchId)
    if heuristicsList is None:
        heuristicsList = [
            u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
            u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
        ]
    starbucksExprDict, starbucksWordDict = utilsString.openEn2FrStarbucksDict()
    # make the output folder
    utilsOs.createEmptyFolder(out)
    # remember whether the reference file was there before we open (and so create) it
    outputRefPath = u'{0}reference.tsv'.format(out)
    referenceAlreadyExists = utilsOs.theFileExists(outputRefPath)
    # get the list of ALL the file paths
    filePathList, subsetIndexes = getSubsetOfFiles(filesIndexes)
    with open(outputRefPath, u'a') as refFile:
        # for each tmx file
        for indexTmx, tmxFilePath in tqdm(enumerate(filePathList)):
            tmxFilePath = b000path.desAnonymizePath(tmxFilePath)
            # get the lines of both languages, a document with a missing file is skipped
            try:
                with open(u'{0}.en'.format(tmxFilePath)) as enFile:
                    enLines = enFile.readlines()
                with open(u'{0}.fr'.format(tmxFilePath)) as frFile:
                    frLines = frFile.readlines()
            except FileNotFoundError:
                print(u'FILE NOT FOUND IN : {0}'.format(tmxFilePath))
                continue
            nbOfLines = len(enLines)
            # the english file gives the number of lines to go through
            for lnIndex in range(nbOfLines):
                srcLn, trgtLn, enLn, frLn = getLines(lnIndex, enLines, frLines, tmxFilePath)
                # apply every heuristic to the line
                for heuristic in heuristicsList:
                    heurFolder = u'{0}{1}/'.format(out, heuristic)
                    utilsOs.createEmptyFolder(heurFolder)
                    # add the score to the file of the heuristic
                    with open(u'{0}score.tsv'.format(heurFolder), u'a') as scoreFile:
                        scoreLn = getLnToWrite(
                            heuristic, srcLn, trgtLn, enLn, frLn,
                            placeInDocument=float(lnIndex) / float(nbOfLines),
                            starbucksExprDict=starbucksExprDict,
                            starbucksWordDict=starbucksWordDict)
                        scoreFile.write(scoreLn)
                # write the ref line, except if the reference output already existed
                if referenceAlreadyExists is not True:
                    refFile.write(u'{0}\t{1}\n'.format(
                        b000path.anonymizePath(tmxFilePath), lnIndex))
    return None
def rewriteFileIfExists(path):
    """
    empty the content of the file if it already exists, the file itself is kept
    (nothing is created when there is no file at that path)
    """
    if utilsOs.theFileExists(path) is not True:
        return
    # opening in write mode truncates the file, the empty string adds nothing to it
    with open(path, u'w') as emptiedFile:
        emptiedFile.write(u'')
def annotateFilesAfterHeurAndSelection(inputFolderPath, outputFolderPath, dumpSP=True):
    """
    given a folder path, where the reference, en line and fr line are
    already selected, annotate the SPs

    The input folder must contain "sampleReference.Paths", "sample.en",
    "sample.fr" and "scores.tsv", which are read as line-aligned files. Each
    sentence pair not yet present in the "sampleReference.tsv" of the output
    folder is printed in the terminal and the annotator types its annotation.
    Typing "c", "correct" or "correction" hands over to correctionToAnnotation.
    After every pair the reference and annotation lists are dumped to
    "sampleReference.tsv" and "sampleAnnotation.tsv" in the output folder.

    :param inputFolderPath: folder containing the selected sample (a final
        slash is added if missing)
    :param outputFolderPath: folder receiving the annotations (a final slash
        is added if missing)
    :param dumpSP: if True, the annotated en line, fr line and score line are
        appended to "sample.en", "sample.fr" and "scores.tsv" in the output folder
    """
    annotationAnswers = [u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2', u'1.3', u'1.4']
    # the same keywords are used to validate the input and to trigger the correction,
    # so an accepted keyword can never reach the float conversion
    correctionAnswers = [u'c', u'correct', u'correction']
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines and their scores
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # pair every reference line with its index in the input files
    referenceLines = [(ind, refLn.replace(u'\n', u'')) for ind, refLn in enumerate(referenceLines)]
    # get rid of the lines we have already annotated
    if utilsOs.theFileExists(u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines, new results are appended to them
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath), noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath), noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        referenceLines = [(ind, refLn) for ind, refLn in referenceLines if refLn not in annotatedFiles]
    else:
        referencePathLine = []
        listOfAnnotations = []
    # print the annotator cheat sheet
    printCheatSheet()
    # show each sentence pair in the terminal
    for indRef, refLn in referenceLines:
        print(u'############# {0} ##############'.format(refLn.replace(u'\n', u'')))
        # the translation direction is read in the reference line
        lnsSource = enLns if u'en-fr' in refLn else frLns
        lnsTarget = frLns if u'en-fr' in refLn else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color in red the source line
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]], lineLength)
        # get the annotation
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # make sure to have the right annotation
        while annotatorGeneralInput not in annotationAnswers and annotatorGeneralInput not in correctionAnswers:
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            annotatorGeneralInput = input(u'Repeat annotation: ')
        if annotatorGeneralInput in correctionAnswers:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again; 7 more lines
        # are blanked than are moved back up here, the final move up below makes up for them
        for _ in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the list
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(referencePathLine,
                             u'{0}sampleReference.tsv'.format(outputFolderPath),
                             addNewline=True, rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(listOfAnnotations,
                             u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
                             addNewline=True, rewrite=True)
        # dump the SP; whatever the direction, the en line comes from enLns and the fr line from frLns
        if dumpSP is True:
            utilsOs.appendLineToFile(enLns[indRef], u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frLns[indRef], u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef], u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)