Exemplo n.º 1
0
def launchForACorpusGoogle(inPath, outPath=None, continueWhereWeLeftOf=True):
    """
    translates a file line by line through the Google Translate web page (driven with a Firefox
    selenium session) and appends one translation per input line to the output file
    :param inPath: path to the file to translate, one segment per line
    :param outPath: path to the output file; if None it is derived from inPath by the two nested
        re.sub calls below (e.g. "x.en" gives "x.fr2en2fr")
    :param continueWhereWeLeftOf: if True, the lines already present in the output file are counted
        and that many input lines are skipped (a missing output file counts as 0 lines);
        any other value empties the output file and starts from the first line
    :return: None
    """
    import errno

    geckoPath = u"/u/alfonsda/progs/geckoDriver/geckodriver"
    googleUrl = "https://translate.google.ca/"
    if outPath is None:
        outPath = re.sub(r"[._]en", ".fr2en", re.sub(r"[._]en", ".en2fr", inPath))
    with open(inPath) as cn10k:
        if continueWhereWeLeftOf is True:
            # resume: each line already in the output stands for one translated input line
            try:
                with open(outPath) as alreadyDone:
                    lastSeenInd = sum(1 for _ in alreadyDone)
            except IOError as err:
                # a missing output file only means that nothing was translated yet (first run)
                if err.errno != errno.ENOENT:
                    raise
                lastSeenInd = 0
        else:
            # start over: empty the output file and make every input index eligible
            with open(outPath, "w") as out10k:
                out10k.write("")
            lastSeenInd = float("-inf")
        with open(outPath, "a") as out10k:
            session = webdriver.Firefox(executable_path=geckoPath)
            try:
                session.get(googleUrl)
                start = utilsOs.countTime()
                for counter, cnLn in enumerate(cn10k):
                    # skip the lines translated in a previous run
                    if counter < lastSeenInd:
                        continue
                    cnLn = cnLn.replace("\n", "")
                    session = chooseLangGoogleTrans(session)
                    translation, session = writeSrcGetTrgt(session, cnLn)
                    out10k.write("{0}\n".format(translation))
                    # take a coffee break if it's time
                    if utilsOs.countTime(start) >= 600:
                        session.close()
                        # nothing to close in the finally clause until a new driver is open
                        session = None
                        time.sleep(random.uniform(20, 60))
                        start = utilsOs.countTime()
                        # open the driver, waiting a long while before the single retry
                        try:
                            session = webdriver.Firefox(executable_path=geckoPath)
                        except OSError:
                            time.sleep(600)
                            session = webdriver.Firefox(executable_path=geckoPath)
                        session.get(googleUrl)
            finally:
                # close the browser even when a translation step raised
                # NOTE(review): close() only closes the window; quit() may be needed to also stop
                # the geckodriver process - confirm before changing
                if session is not None:
                    session.close()
Exemplo n.º 2
0
def launchForACorpusDeepL(inPath, outPath=None, continueWhereWeLeftOf=True):
    """
    translates a file line by line through the deepL translator web page (driven with a Firefox
    selenium session) and appends one translation per input line to the output file
    :param inPath: path to the file to translate, one segment per line
    :param outPath: path to the output file; if None it is derived from inPath by the two nested
        re.sub calls below (e.g. "x.en" gives "x.fr2en2fr")
    :param continueWhereWeLeftOf: if True, the lines already present in the output file are counted
        and that many input lines are skipped (a missing output file counts as 0 lines);
        any other value empties the output file and starts from the first line
    :return: None
    """
    import errno

    geckoPath = u"/u/alfonsda/progs/geckoDriver/geckodriver"
    deepLUrl = "https://www.deepl.com/translator"
    if outPath is None:
        outPath = re.sub(r"[._]en", ".fr2en", re.sub(r"[._]en", ".en2fr", inPath))
    with open(inPath) as cn10k:
        if continueWhereWeLeftOf is True:
            # resume: each line already in the output stands for one translated input line
            try:
                with open(outPath) as alreadyDone:
                    lastSeenInd = sum(1 for _ in alreadyDone)
            except IOError as err:
                # a missing output file only means that nothing was translated yet (first run)
                if err.errno != errno.ENOENT:
                    raise
                lastSeenInd = 0
        else:
            # start over: empty the output file and make every input index eligible
            with open(outPath, "w") as out10k:
                out10k.write("")
            lastSeenInd = float("-inf")
        with open(outPath, "a") as out10k:
            session = webdriver.Firefox(executable_path=geckoPath)
            try:
                session.get(deepLUrl)
                start = utilsOs.countTime()
                for counter, cnLn in enumerate(cn10k):
                    # skip the lines translated in a previous run
                    if counter < lastSeenInd:
                        continue
                    cnLn = cnLn.replace("\n", "")
                    # the 4th argument is the number of space-separated tokens of the line
                    session, enFrTranslAndAlt, timeStampEn = translateOneLang(session, u"en", cnLn,
                                                                              len(cnLn.split(" ")), [])
                    # only the first element of the returned translations is written
                    out10k.write("{0}\n".format(enFrTranslAndAlt[0]))
                    # take a coffee break if it's time
                    if utilsOs.countTime(start) >= 600:
                        session.close()
                        # nothing to close in the finally clause until a new driver is open
                        session = None
                        time.sleep(random.uniform(20, 60))
                        start = utilsOs.countTime()
                        # open the driver with the same geckodriver path as the first session,
                        # waiting a long while before the single retry
                        try:
                            session = webdriver.Firefox(executable_path=geckoPath)
                        except OSError:
                            time.sleep(600)
                            session = webdriver.Firefox(executable_path=geckoPath)
                        session.get(deepLUrl)
            finally:
                # close the browser even when a translation step raised
                # NOTE(review): close() only closes the window; quit() may be needed to also stop
                # the geckodriver process - confirm before changing
                if session is not None:
                    session.close()
                dictCount[u"total"] += 1
                if prediction is None:
                    dictCount[u"silences"] += 1
                elif prediction == 0:
                    dictCount[u"zeros"] += 1
                elif prediction == 1:
                    dictCount[u"ones"] += 1
                # next line
                scLn = scFile.readline()
    print(dictCount)




# reference time used to count how long the script takes to run
# (presumably compared later through utilsOs.countTime(startTime) - that call is not visible here)
startTime = utilsOs.countTime()

# extract the very problematic

# print("PROBLEMATIC - FLAGGED")
# extractVeryProblematic(folderPaths=[u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY'])
# print("PROBLEMATIC - NOT-FLAGGED")
# extractVeryProblematic(folderPaths=[u'NOT-FLAGGED'])

# extract the not problematic at all

# print("NOT-PROBLEMATIC - FLAGGED")
# extractVeryNonProblematic(folderPaths=[u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY'])
# print("NOT-PROBLEMATIC - NOT-FLAGGED")
# extractVeryNonProblematic(folderPaths=[u'NOT-FLAGGED'])
Exemplo n.º 4
0
def _loadDeepLAndLogIn(session, deepLUrl, user, passw):
    """
    loads the deepL page in an already opened driver, waits a short random time and logs in
    :param session: opened selenium driver
    :param deepLUrl: url of the deepL translator page
    :param user: user name of the deepL account
    :param passw: password of the deepL account
    :return: the session returned by authentificateBtUseSelenium
    """
    session.get(deepLUrl)
    time.sleep(random.uniform(1.3, 3.1))
    return authentificateBtUseSelenium(user, passw, session)


def launchForOneDay(tokLimit=4000,
                    outputFolderPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/",
                    coffeeBreak=1650):
    """
    launches the deepL bot for one day's worth, once for each of the two deepL accounts
    :param tokLimit: maximum number of tokens to treat with one account; the loop for an account
        stops as soon as its token count reaches tokLimit-10
    :param outputFolderPath: path to the folder where will be output the files; it is used as a raw
        prefix of the file names, so it must end with a path separator
    :param coffeeBreak: time in seconds when to take a break and start a new deepL session,
        None to never take a break
    :return: (tokCount, iterCount): number of tokens translated and number of sentence pairs
        treated; both counters are reset for each account, so the values returned are those of
        the last account only
    """
    start = utilsOs.countTime()
    # path to the referencer, indicating where we left off: path and last index worked
    referencerPath = u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"
    # info
    deepLUrl = u"https://www.deepl.com/translator"
    mUser, mPass, sUser, sPass = b000path.getDeepLProfileInfo()
    # for each user
    for user, passw in zip([sUser, mUser], [sPass, mPass]):
        tokCount = 0
        iterCount = 0
        # open the driver
        session = webdriver.Firefox()
        try:
            # log to deepL
            session = _loadDeepLAndLogIn(session, deepLUrl, user, passw)
            # while we have not gone over the daily limit
            while tokCount < (tokLimit-10):
                # get the sp
                sp, filePath, fileIndex, refLns = getANewSpWhereWeLeftOff(referencerPath)
                session, nbOfTok, enFrTranslAndAlt, frEnTranslAndAlt, timeEn, timeFr = translateSpGetResult(session, sp)
                # dump the referencer lines
                utilsOs.dumpRawLines(refLns, referencerPath, addNewline=False, rewrite=True)
                # dump original sp
                utilsOs.appendLineToFile(sp[0], u"{0}originalSent.en".format(outputFolderPath), addNewLine=True)
                utilsOs.appendLineToFile(sp[1], u"{0}originalSent.fr".format(outputFolderPath), addNewLine=True)
                # dump translation and variants
                utilsOs.appendLineToFile(enFrTranslAndAlt, u"{0}translated.en2fr".format(outputFolderPath), addNewLine=True)
                utilsOs.appendLineToFile(frEnTranslAndAlt, u"{0}translated.fr2en".format(outputFolderPath), addNewLine=True)
                # dump reference
                utilsOs.appendLineToFile(u"{0}\t{1}\n".format(filePath, fileIndex),
                                         u"{0}reference.tsv".format(outputFolderPath), addNewLine=False)
                # dump timestamp
                utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeEn, transformTimeToLocalTime(timeEn)),
                                         u"{0}timestamp.en".format(outputFolderPath), addNewLine=True)
                utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeFr, transformTimeToLocalTime(timeFr)),
                                         u"{0}timestamp.fr".format(outputFolderPath), addNewLine=True)
                # add number of tokens
                tokCount += nbOfTok
                # add nb of iterations
                iterCount += 1
                # take a coffee break if it's time
                if coffeeBreak is not None and utilsOs.countTime(start) >= coffeeBreak:
                    session.close()
                    # nothing to close in the finally clause until a new driver is open
                    session = None
                    time.sleep(random.uniform(60, 80))
                    start = utilsOs.countTime()
                    # open the driver and log to deepL again
                    session = webdriver.Firefox()
                    session = _loadDeepLAndLogIn(session, deepLUrl, user, passw)
                time.sleep(random.uniform(1.0, 1.5))
        finally:
            # close the driver, even when a translation or dump step raised
            if session is not None:
                session.close()
        time.sleep(random.uniform(10.0, 15.0))
    return tokCount, iterCount