def tableOfContents(stringSrc,
                    stringTrgt,
                    nTokens=4,
                    contextScores=None,
                    placeInDocument=None):
    """ given a string sentence pair return a score of the ratio
    of small sentence pairs in the context of the current sp """
    # change the place in the doc to obtain low metric in the beginning and end of doc and a high one at the middle
    placeInDocument = math.sqrt(placeInDocument - (placeInDocument**2)) * 2
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    scores = [tooFewTokens(stringSrc, stringTrgt, nTokens)]
    # remake the token list into a string so we can check the first characters
    origSrcString = u' '.join(stringSrc)
    if len(origSrcString) > 4:
        # if there is a number or a symbol indicating a table of contents at the start of the string
        extractedNmbrs = utilsString.extractNumbersFromString(origSrcString[:3])
        if len(extractedNmbrs) != 0 or u'-' in origSrcString[:3] or u'.' in origSrcString[:3]:
            scores.append(0)
        else:
            scores.append(1)
    # add the context to the current scores
    if contextScores is not None:
        scores = scores + contextScores
    # add the location of the sentence in the document to the current scores
    if placeInDocument is not None:
        scores = scores + [placeInDocument]
    return sum(scores) / len(scores)
def tooFewTokens(stringSrc, stringTrgt, nTokens=4):
    """ given a string sentence pair return 0 if there are less
    than N tokens on either the src or the trgt and return 1 otherwise """
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # count the tokens
    if len(stringSrc) <= nTokens or len(stringTrgt) <= nTokens:
        return 0
    return 1
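
# A minimal usage sketch for the two heuristics above (hypothetical tokens);
# tooFewTokens needs nothing beyond the token lists, while tableOfContents also
# relies on `math` and the project-local utilsString module, so that call is
# left commented out:
srcToks = [u'1', u'.', u'2', u'introduction']
trgtToks = [u'1', u'.', u'2', u'introduction']
print(tooFewTokens(srcToks, trgtToks, nTokens=4))  # 0: both sides have 4 tokens or fewer
# tableOfContents(srcToks, trgtToks, nTokens=4,
#                 contextScores=[0.0, 0.0], placeInDocument=0.05)  # low score near the document start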
Example #3
def transformStringToGizaFormat(string, tokDict, lang, pathToFile):
    """ given a sentence string and a token-to-id dictionary, return the
    sentence as a string of token ids (GIZA format) """
    # get the language in the right format
    lang = getLang(lang, pathToFile)
    # tokenize
    # tokList = utilsString.naiveRegexTokenizer(string, language=lang,
    #                                           capturePunctuation=True, captureSymbols=True)
    tokList = utilsString.nltkTokenizer(string)
    # remake a string using the token ids instead of the actual tokens
    idString = []
    for tok in tokList:
        # look up the token as-is, then with a trailing or leading apostrophe,
        # then with apostrophes removed
        for candidate in (tok, u"{0}'".format(tok), u"'{0}".format(tok),
                          tok.replace(u"'", u"")):
            if candidate in tokDict:
                idString.append(tokDict[candidate])
                break
        else:
            print(u'ERROR with token: ', repr(tok), type(tok))
    idString = [str(id) for id in idString]
    return u" ".join(idString)
def cognateCoincidence(stringSrc, stringTrgt, cognateSize=4):
    """ given a string sentence pair return the ratio of coincidence
     between the cognates (start of word char ngram) between source and target"""
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # sort by decreasing length of the original word
    stringSrc.sort(key=lambda tok: len(tok), reverse=True)
    stringTrgt.sort(key=lambda tok: len(tok), reverse=True)
    # compile the cognates of each token for the source and target
    srcCognates = getCognates(stringSrc, cognateSize)
    trgtCognates = set(getCognates(stringTrgt, cognateSize))
    # get intersection of cognates
    intersection = [cog for cog in srcCognates if cog in trgtCognates]
    smallerLength = min(len(srcCognates), len(trgtCognates))
    if smallerLength == 0:
        return 0
    return len(intersection) / smallerLength
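
# getCognates is not defined in this snippet; a minimal sketch consistent with
# the docstring above (word-initial character n-grams), assuming tokens no
# longer than the cognate size are skipped:
def getCognates(tokensList, cognateSize=4):
    """ return the word-initial character n-grams (cognates) of the given tokens """
    cognates = []
    for tok in tokensList:
        if len(tok) > cognateSize:
            cognates.append(tok[:cognateSize])
    return cognates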
Example #5
def makeFreqDict(pathToFile, lang=None):
    """ given a path to a file, return a dictionary mapping each token to its
    frequency in the file """
    freqTokDict = {}
    # get the language in the right format
    lang = getLang(lang, pathToFile)
    with open(pathToFile) as inFile:
        for line in inFile:
            line = line.replace(u"\n", u"")
            # tokenize
            # toks = utilsString.naiveRegexTokenizer(line, language=lang,
            #                                        capturePunctuation=True, captureSymbols=True)
            toks = utilsString.nltkTokenizer(line)
            # add to the token freq dict
            for tok in toks:
                if tok not in freqTokDict:
                    freqTokDict[tok] = 0
                freqTokDict[tok] += 1
    return freqTokDict
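
# A minimal usage sketch (hypothetical corpus path; getLang and utilsString are
# project-local helpers, so the call is left commented out):
# freqs = makeFreqDict(u'./corpus/train.en', lang=u'en')
# sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:10]  # ten most frequent tokens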
Example #6
            if len(utilsString.extractNumbersFromString(
                    srcLn[:3])) != 0 or u'-' in srcLn[:3] or u'.' in srcLn[:3]:
                if docLoc < 0.2:
                    tablMat['dashOrNb']['0.0-0.19'] += 1
                elif docLoc < 0.4:
                    tablMat['dashOrNb']['0.2-0.39'] += 1
                elif docLoc < 0.6:
                    tablMat['dashOrNb']['0.4-0.59'] += 1
                elif docLoc < 0.8:
                    tablMat['dashOrNb']['0.6-0.79'] += 1
                else:
                    tablMat['dashOrNb']['0.8-1.0'] += 1
                if refLines[srcLnIndex] != u'1.0\n':
                    count[u'small'] += 1

            srcLn = utilsString.nltkTokenizer(srcLn, addSeparators)
            trgtLn = utilsString.nltkTokenizer(trgtLn, addSeparators)

            # # compile the cognates of each token for the source and target
            # srcCognates = b003heuristics.getCognates(srcLn, 4)
            # trgtCognates = set(b003heuristics.getCognates(trgtLn, 4))
            # # get intersection of cognates
            # intersection = [cog for cog in srcCognates if cog in trgtCognates]
            # lenin = len(intersection)
            # if lenin not in cogn:
            #     cogn[lenin] = 0
            # cogn[lenin] += 1

            # sizeSrc = len(srcLn)
            # sizeTrgt = len(trgtLn)
            # if abs(sizeSrc-sizeTrgt) not in lenT:
def nbMismatch(stringSrc,
               stringTrgt,
               includeNumberNames=True,
               useEditDistance=True):
    """ given a string sentence pair, returns a score indicating how much a the
    numbers in the source appear in the target """
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower().replace(
            u' pm', u'pm'), stringTrgt.lower().replace(u' pm', u'pm')
        addSeparators = [u'.', u',', u':', u'/', u'-', u'h', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # transform all number names into actual numbers
    if includeNumberNames is True:
        stringSrc = utilsString.transformNbNameToNb(stringSrc)
        stringTrgt = utilsString.transformNbNameToNb(stringTrgt)
    # get the tokens containing a digit
    nbrs = re.compile(r'[0-9]')
    stringSrcList = [tok for tok in stringSrc if len(re.findall(nbrs, tok)) != 0]
    stringTrgtList = [tok for tok in stringTrgt if len(re.findall(nbrs, tok)) != 0]
    # if there were no numbers, return the max score (we can't use this heuristic to evaluate)
    if len(stringSrcList) + len(stringTrgtList) == 0:
        return 1.0
    # if we want to search for the exact same numbers
    if useEditDistance is False:
        # extract the figures from the tokens
        numbersInSrc = set(getNbsAlone(stringSrcList))
        numbersInTrgt = set(getNbsAlone(stringTrgtList))
        # if there were no numbers, return the max score (we can't use this heuristic to evaluate)
        if len(numbersInSrc) + len(numbersInTrgt) == 0:
            return 1.0
        # calculate the score of src-trgt coincidence
        nbIntersection = numbersInSrc.intersection(numbersInTrgt)
        return len(nbIntersection) / (
            (len(numbersInSrc) + len(numbersInTrgt)) / 2)
    # if we want to use the edit distance to match the source digit tokens with the target ones
    else:
        nbIntersection = []
        # sort the digit-bearing src token list by decreasing length
        stringSrcList.sort(key=lambda tok: len(tok), reverse=True)
        # make a copy of the target list
        trgtList = stringTrgtList.copy()
        for srcTok in stringSrcList:
            # find the most similar trgt token
            mostSimil = [None, None, None, 1]
            for trgtInd, trgtTok in enumerate(trgtList):
                editDistScore = utilsString.getNormalizedEditDist(
                    srcTok, trgtTok)
                # get the less distant in the trgt tokens
                if editDistScore < 0.5 and editDistScore < mostSimil[-1]:
                    mostSimil = [srcTok, trgtTok, trgtInd, editDistScore]
            # remove the most similar from the trgt list
            if mostSimil[0] is not None:
                del trgtList[mostSimil[-2]]
                nbIntersection.append(tuple(mostSimil[:2]))
        return len(nbIntersection) / (
            (len(stringSrcList) + len(stringTrgtList)) / 2)
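
# A minimal usage sketch with pre-tokenized, hypothetical input; `re` must be
# importable and utilsString.getNormalizedEditDist is a project-local helper,
# so the call is left commented out:
# srcToks = [u'meeting', u'on', u'june', u'12', u'at', u'3', u':', u'30', u'pm']
# trgtToks = [u'reunion', u'le', u'12', u'juin', u'a', u'15h30']
# nbMismatch(srcToks, trgtToks, includeNumberNames=False, useEditDistance=True)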
def extractMisalignedSP(pathToSrcTrgtFiles,
                        extractionSize=100,
                        typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used returns SP (sentence pairs) extracted as misaligned
    extractor types:
    - 0 : same number presence in src and trgt
    - 1 : 4 or less than 4 tokens
    - 2 : """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0

    # get name of subset
    subsetName = u''
    for subset in [
            u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'
    ]:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # extractor type 0 block
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    # extractor type 1 block
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    # extractor type 2 block
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(
            filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(
            filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # lowercase and tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [
                            u'.', u',', u':', u'/', u'-', u"''", u"'"
                        ]
                        srcTokens = utilsString.nltkTokenizer(
                            srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(
                            trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                nbMismatch, 0.75, srcTokens, trgtTokens,
                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get context scores and location in doc
                            cntxtScores = getContextScores(
                                srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(
                                tableOfContents,
                                0.32,
                                srcTokens,
                                trgtTokens,
                                extractedSp,
                                filePath,
                                1,
                                int(srcLnIndex),
                                contextScores=cntxtScores,
                                placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, 100, subsetName)
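
# A minimal usage sketch (hypothetical corpus path; utilsOs, utilsString and the
# helpers applyExtractor, getContextScores and randomlyExtractAndDump are
# project-local, so the call is left commented out):
# extractMisalignedSP(u'/path/to/corpus/MISALIGNED/', extractionSize=100,
#                     typeOfExtractors=[0, 1, 2])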