示例#1
0
def main(inputFileName1, alignFileName1, inputFileName2, alignFileName2,
         mono1FileName, mono2FileName, outputFileName, numClusInit,
         typeClusInit, fileLength, monoPower, biPower, edgeThresh1,
         edgeThresh2):
    """Drive the three-language clustering run from input files to output.

    Two bilingual data sets that share one language are read, the language
    and language-pair objects are built from the collected counts, the
    clustering routine is run on them, and the clusters are printed.

    Language numbering used throughout: 1 = en, 2 = de, 3 = fr, where de is
    the language common to the en-de and fr-de pairs.

    Args:
        inputFileName1, alignFileName1: Passed to ``readBilingualData`` for
            the en-de pair.
        inputFileName2, alignFileName2: Passed to ``readBilingualData`` for
            the fr-de pair.
        mono1FileName, mono2FileName: Passed unchanged to both
            ``readBilingualData`` calls.
        outputFileName: Passed to ``printClusters``.
        numClusInit, typeClusInit, edgeThresh1, edgeThresh2: Passed to
            ``initializeLanguagePairObjets``.
        fileLength: Passed to both ``readBilingualData`` calls.
        monoPower, biPower: Passed to ``runOchClustering``.

    Returns:
        None.
    """
    # One word counter and one bigram counter per language, all empty.
    wordsEn, bigramsEn = Counter(), Counter()
    wordsDe, bigramsDe = Counter(), Counter()
    wordsFr, bigramsFr = Counter(), Counter()

    # Read the en-de data; the de counters returned here are fed into the
    # fr-de read below, so de is shared between the two pairs.
    enDeResult = readBilingualData(
        fileLength, inputFileName1, alignFileName1,
        mono1FileName, mono2FileName,
        wordsEn, bigramsEn, wordsDe, bigramsDe)
    alignEnDe, wordsEn, bigramsEn, wordsDe, bigramsDe = enDeResult

    # Read the fr-de data.
    frDeResult = readBilingualData(
        fileLength, inputFileName2, alignFileName2,
        mono1FileName, mono2FileName,
        wordsFr, bigramsFr, wordsDe, bigramsDe)
    alignFrDe, wordsFr, bigramsFr, wordsDe, bigramsDe = frDeResult

    # Build the per-language and per-pair objects.
    lang1, lang2, lang3, lang12, lang21, lang32, lang23 = \
        initializeLanguagePairObjets(
            alignEnDe, alignFrDe,
            wordsEn, bigramsEn, wordsDe, bigramsDe, wordsFr, bigramsFr,
            numClusInit, typeClusInit, edgeThresh1, edgeThresh2)

    # Drop this function's references to the raw alignment and count data
    # before clustering starts.
    del alignEnDe, alignFrDe
    del wordsEn, bigramsEn, wordsDe, bigramsDe, wordsFr, bigramsFr

    # Run the clustering algorithm and get new clusters.
    runOchClustering(lang1, lang2, lang3,
                     lang12, lang21, lang32, lang23,
                     monoPower, biPower)

    # Print the clusters; the last two arguments are unused here.
    printClusters(outputFileName, lang1, lang2, lang3, None, None)
示例#2
0
def main(inputFileName1, alignFileName1, inputFileName2, alignFileName2,
         mono1FileName, mono2FileName, outputFileName,
         numClusInit, typeClusInit, fileLength,
         monoPower, biPower, edgeThresh1, edgeThresh2):
    """Read two bilingual data sets, cluster three languages, print clusters.

    Languages are numbered 1 = en, 2 = de, 3 = fr; de is the common language
    of the en-de and fr-de pairs.

    Args:
        inputFileName1, alignFileName1: Given to ``readBilingualData`` when
            reading the en-de pair.
        inputFileName2, alignFileName2: Given to ``readBilingualData`` when
            reading the fr-de pair.
        mono1FileName, mono2FileName: Given to both ``readBilingualData``
            calls as-is.
        outputFileName: Given to ``printClusters``.
        numClusInit, typeClusInit: Given to
            ``initializeLanguagePairObjets``.
        fileLength: Given to both ``readBilingualData`` calls.
        monoPower, biPower: Given to ``runOchClustering``.
        edgeThresh1, edgeThresh2: Given to ``initializeLanguagePairObjets``.

    Returns:
        None.
    """
    # Start every language with empty unigram and bigram counts.
    enUni = Counter()
    enBi = Counter()
    deUni = Counter()
    deBi = Counter()
    frUni = Counter()
    frBi = Counter()

    # Read the input files and collect word counts, first en-de ...
    enDeAlign, enUni, enBi, deUni, deBi = readBilingualData(
        fileLength,
        inputFileName1, alignFileName1,
        mono1FileName, mono2FileName,
        enUni, enBi, deUni, deBi,
    )

    # ... then fr-de, continuing with the de counts from the first read.
    frDeAlign, frUni, frBi, deUni, deBi = readBilingualData(
        fileLength,
        inputFileName2, alignFileName2,
        mono1FileName, mono2FileName,
        frUni, frBi, deUni, deBi,
    )

    (lang1, lang2, lang3,
     lang12, lang21, lang32, lang23) = initializeLanguagePairObjets(
        enDeAlign, frDeAlign,
        enUni, enBi, deUni, deBi, frUni, frBi,
        numClusInit, typeClusInit, edgeThresh1, edgeThresh2,
    )

    # The raw alignments and counts are not referenced again in this
    # function; unbind them before the clustering step.
    del enDeAlign, frDeAlign
    del enUni, enBi, deUni, deBi, frUni, frBi

    # Run the clustering algorithm and get new clusters.
    runOchClustering(
        lang1, lang2, lang3, lang12, lang21, lang32, lang23,
        monoPower, biPower,
    )

    # Print the clusters (no fourth or fifth language in this variant).
    printClusters(outputFileName, lang1, lang2, lang3, None, None)
示例#3
0
def main(inputFileName, alignFileName, mono1FileName, mono2FileName,
         outputFileName, numClusInit, typeClusInit, fileLength,
         monoPower, biPower, edgeThresh):
    """Run the two-language clustering pipeline and print the clusters.

    Args:
        inputFileName, alignFileName, mono1FileName, mono2FileName,
        fileLength: Passed to ``readBilingualData``.
        outputFileName: Passed to ``printClusters``.
        numClusInit, typeClusInit, edgeThresh: Passed to
            ``initializeLanguagePairObjets``.
        monoPower, biPower: Passed to ``runOchClustering``.

    Returns:
        None.
    """
    # Empty word and bigram counters for the two languages (en and fr).
    wordsEn, bigramsEn = Counter(), Counter()
    wordsFr, bigramsFr = Counter(), Counter()

    # Read the input file and get word counts.
    readResult = readBilingualData(
        fileLength, inputFileName, alignFileName,
        mono1FileName, mono2FileName,
        wordsEn, bigramsEn, wordsFr, bigramsFr)
    alignment, wordsEn, bigramsEn, wordsFr, bigramsFr = readResult

    lang1, lang2, lang12, lang21 = initializeLanguagePairObjets(
        alignment, wordsEn, bigramsEn, wordsFr, bigramsFr,
        numClusInit, typeClusInit, edgeThresh)

    # Unbind the raw alignment and counts before clustering.
    del alignment, wordsEn, bigramsEn, wordsFr, bigramsFr

    # Run the clustering algorithm and get new clusters.
    runOchClustering(lang1, lang2, lang12, lang21, monoPower, biPower)

    # Print the clusters; only two languages exist, so the rest are None.
    printClusters(outputFileName, lang1, lang2, None, None, None)
示例#4
0
def main(
    inputFileName1,
    alignFileName1,
    inputFileName2,
    alignFileName2,
    inputFileName3,
    alignFileName3,
    inputFileName4,
    alignFileName4,
    mono1FileName,
    mono2FileName,
    outputFileName,
    numClusInit,
    typeClusInit,
    fileLength,
    monoPower,
    biPower,
    edgeThresh1,
    edgeThresh2,
    edgeThresh3,
    edgeThresh4,
):
    """Run the five-language clustering pipeline and print the clusters.

    Four bilingual data sets are read, each pairing one language with the
    common language de. The language and language-pair objects are then
    built, the clustering routine is run, and the clusters are printed.

    Language numbering: 1 = en, 2 = de (common), 3 = fr, 4 = fourth,
    5 = fifth.

    Args:
        inputFileName1, alignFileName1: Passed to ``readBilingualData`` for
            the en-de pair.
        inputFileName2, alignFileName2: Passed to ``readBilingualData`` for
            the fr-de pair.
        inputFileName3, alignFileName3: Passed to ``readBilingualData`` for
            the fourth-de pair.
        inputFileName4, alignFileName4: Passed to ``readBilingualData`` for
            the fifth-de pair.
        mono1FileName, mono2FileName: Passed unchanged to every
            ``readBilingualData`` call.
        outputFileName: Passed to ``printClusters``.
        numClusInit, typeClusInit: Passed to
            ``initializeLanguagePairObjets``.
        fileLength: Passed to every ``readBilingualData`` call.
        monoPower, biPower: Passed to ``runOchClustering``.
        edgeThresh1, edgeThresh2, edgeThresh3, edgeThresh4: Passed to
            ``initializeLanguagePairObjets``.

    Returns:
        None.
    """
    # Read the input files and get word counts.
    # 5 languages: en, de, fr, fourth, fifth; de is the common language in
    # en-de, fr-de, fourth-de and fifth-de.
    # 1: en 2: de 3: fr 4: fourth 5: fifth
    enWordDict = Counter()
    enBigramDict = Counter()
    deWordDict = Counter()
    deBigramDict = Counter()
    frWordDict = Counter()
    frBigramDict = Counter()
    fourthWordDict = Counter()
    fourthBigramDict = Counter()
    fifthWordDict = Counter()
    fifthBigramDict = Counter()

    # NOTE(review): the same mono1FileName/mono2FileName are passed for every
    # pair; confirm that this is what readBilingualData expects.
    alignDictEnDe, enWordDict, enBigramDict, deWordDict, deBigramDict = readBilingualData(
        fileLength,
        inputFileName1,
        alignFileName1,
        mono1FileName,
        mono2FileName,
        enWordDict,
        enBigramDict,
        deWordDict,
        deBigramDict,
    )

    alignDictFrDe, frWordDict, frBigramDict, deWordDict, deBigramDict = readBilingualData(
        fileLength,
        inputFileName2,
        alignFileName2,
        mono1FileName,
        mono2FileName,
        frWordDict,
        frBigramDict,
        deWordDict,
        deBigramDict,
    )

    # The fourth language must use its own counters. Passing the French
    # counters here (as the code previously did) mixes the fourth language's
    # data into the French counts and leaves the fourth counters unused.
    alignDictFourthDe, fourthWordDict, fourthBigramDict, deWordDict, deBigramDict = readBilingualData(
        fileLength,
        inputFileName3,
        alignFileName3,
        mono1FileName,
        mono2FileName,
        fourthWordDict,
        fourthBigramDict,
        deWordDict,
        deBigramDict,
    )

    alignDictFifthDe, fifthWordDict, fifthBigramDict, deWordDict, deBigramDict = readBilingualData(
        fileLength,
        inputFileName4,
        alignFileName4,
        mono1FileName,
        mono2FileName,
        fifthWordDict,
        fifthBigramDict,
        deWordDict,
        deBigramDict,
    )

    lang1, lang2, lang3, lang4, lang5, lang12, lang21, lang32, lang23, lang42, lang24, lang52, lang25 = initializeLanguagePairObjets(
        alignDictEnDe,
        alignDictFrDe,
        alignDictFourthDe,
        alignDictFifthDe,
        enWordDict,
        enBigramDict,
        deWordDict,
        deBigramDict,
        frWordDict,
        frBigramDict,
        fourthWordDict,
        fourthBigramDict,
        fifthWordDict,
        fifthBigramDict,
        numClusInit,
        typeClusInit,
        edgeThresh1,
        edgeThresh2,
        edgeThresh3,
        edgeThresh4,
    )

    # Unbind the raw alignments and counts before the clustering step; they
    # are not referenced again in this function.
    del alignDictEnDe, alignDictFrDe, alignDictFourthDe, alignDictFifthDe
    del enWordDict, enBigramDict, deWordDict, deBigramDict, frWordDict, frBigramDict, fourthWordDict, fourthBigramDict
    del fifthWordDict, fifthBigramDict

    # Run the clustering algorithm and get new clusters
    runOchClustering(
        lang1,
        lang2,
        lang3,
        lang4,
        lang5,
        lang12,
        lang21,
        lang32,
        lang23,
        lang42,
        lang24,
        lang52,
        lang25,
        monoPower,
        biPower,
    )

    # Print the clusters
    printClusters(outputFileName, lang1, lang2, lang3, lang4, lang5)