def no_test_tonto(self):
		
		taggroups=self.taggroups
		
		bilphrasesSet=ruleLearningLib.AlignmentTemplateSet(taggroups)
		originalATList=list()
		
		numAt=0
		print >> sys.stderr, "Reading ALignment Templates/ Bilingual Phrases...."
		for line in sys.stdin:
			numAt+=1
			
			line=line.decode('utf-8').strip()
			at = ruleLearningLib.AlignmentTemplate()
			
			piecesOfline=line.split(u'|')
			textat=u'|'.join(piecesOfline[1:5])
			freq=piecesOfline[0].strip()
			
			sllemmastext=piecesOfline[5].strip()
			tllemmastext=piecesOfline[6].strip()
			sllemmas=sllemmastext.split(u'\t')
			tllemmas=tllemmastext.split(u'\t')
			
			at.parse(textat)
			
			
			at.freq=int(freq)
							
			tl_lemmas_from_dictionary_text=piecesOfline[7].strip()
			tl_lemmas_from_dictionary_list=tl_lemmas_from_dictionary_text.split(u'\t')
			
			bilphrase=copy.deepcopy(at)
			bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list
			bilphrase.lexicalise_all(sllemmas,tllemmas)
			bilphrase.id=numAt
			bilphrasesSet.add(bilphrase)
			
			originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list))
			
			#print bilphrase.tl_lemmas_from_dictionary
			
		print >> sys.stderr, " ....."+str(len(originalATList))+" items."
		solution=generaliseATs.generalise_by_linear_program(bilphrasesSet,originalATList,taggroups)
		
		for at in solution:
			print at
    # NOTE(review): fragment of a larger function — its `def` header and the
    # remainder of the reading loop below are outside this view.
    if uponedir_files:
        # Rewrite .../<dir>/<file> to .../ats/<file>: the intermediate files
        # presumably live in a sibling "ats" directory one level up — TODO
        # confirm against the caller.
        parts_slash = intermediate_input_file_only_number.split("/")
        parts_slash = parts_slash[:-2] + ["ats"] + parts_slash[-1:]
        intermediate_input_file_only_number_aux = "/".join(parts_slash)
    elif read_generalised_bilphrases_from_dir:
        # Keep only the base file name and look it up in the configured directory.
        parts_slash = intermediate_input_file_only_number.split("/")
        intermediate_input_file_only_number_aux = read_generalised_bilphrases_from_dir + "/" + parts_slash[
            -1]

    # Optional ".gz" suffix when the intermediate files are gzip-compressed.
    gzipSuffix = ""
    if gzip_files:
        gzipSuffix = ".gz"

    #Read ATs
    finalAlignmentTemplates = ruleLearningLib.AlignmentTemplateSet()
    atsFileName = intermediate_input_file + ".ats" + gzipSuffix
    if gzip_files:
        file = gzip.open(atsFileName, 'r')
    else:
        file = open(atsFileName, 'r')
    id = 0

    atsByFreq = dict()

    # Each line: "<freq> | <AT textual representation>" (UTF-8 bytes, Python 2).
    for line in file:
        line = line.strip().decode('utf-8')
        parts = line.split(u" | ")
        at = ruleLearningLib.AlignmentTemplate()
        at.parse(u" | ".join(parts[1:]))
        at.freq = int(parts[0])
示例#3
0
def process_bilingual_phrases(atListWithLemmasList, bilingualPhrases,
                              generalisationOptions, generationMethod,
                              allowedSLLemmas):
    """Generalise bilingual phrases into alignment templates and print them.

    Parameters (types inferred from use — confirm against callers):
      atListWithLemmasList: iterable of 4-tuples
          (at, sllemmas, tllemmas, tllemmasfromdictionary).
      bilingualPhrases: AlignmentTemplateSet of lexicalised phrases used to
          score each candidate AT (matching vs. correctly reproduced).
      generalisationOptions: option object queried for the generalisation
          strategy (restriction values, pruning flags, ...).
      generationMethod: AlignmentTemplateGenerationMethod selector.
      allowedSLLemmas: if truthy, a container of SL-lemma tuples; lexical
          variants whose SL lemmas are not in it are discarded.

    Side effects: writes the final AT sets to stdout and timing statistics
    to stderr (Python 2 print syntax). Returns nothing.
    """
    finalAlignmentTemplates = ruleLearningLib.AlignmentTemplateSet()
    idAt = 1

    # Memoisation caches keyed by AT: structural/lexical generalisations are
    # computed once per distinct key and reused for repeated phrases.
    structuralVariationsDictionary = dict()
    lexicalVariationsDictionary = dict()
    # Maps each generated AT to the list of "afterwards restrictions"
    # collected for it, consumed in the restriction-adding phase below.
    afterwardsDictionary = dict()

    # Wall-clock accumulators, reported on stderr at the end of the function.
    timeStructuralvariations = 0.0
    timeLexicalVariations = 0.0
    timeRemovingWrongAlignments = 0.0
    timeCorrectAndIncorrect = 0.0
    timeAfterwardsRestrictions = 0.0

    # Phase 1: generate candidate ATs from every input phrase.
    for atWithLemmas in atListWithLemmasList:
        at = atWithLemmas[0]
        sllemmas = atWithLemmas[1]
        tllemmas = atWithLemmas[2]
        tllemmasfromdictionary = atWithLemmas[3]

        debug("Generalising " + str(at) + " | " + str(sllemmas) + " | " +
              str(tllemmas))

        if generationMethod == AlignmentTemplateGenerationMethod.FIRST_APPROACH:
            # First approach: delegate all generation to ruleLearningLib,
            # which adds the generalisations directly to the final set.
            subsetsGraph = ruleLearningLib.SubsetGraph()
            idAt = ruleLearningLib.AlignmentTemplate_generate_all_generalisations_and_add_them(
                at, sllemmas, tllemmas, tllemmasfromdictionary,
                finalAlignmentTemplates, idAt, subsetsGraph, True, True,
                generalisationOptions.get_genWhenEmptyTLCats(),
                generalisationOptions.get_genWhenEmptySLCats())
        elif generationMethod == AlignmentTemplateGenerationMethod.TL_VARIABLES:
            debug("Checking whether hash '" + str(hash(at)) +
                  "' is in the dictionary |d| = " +
                  str(len(structuralVariationsDictionary)) + ".")
            #wildcard and reference values
            if not at in structuralVariationsDictionary:
                debug("AT not found in structural generalisations")
                starttime = time()
                structuralVariationsAts = ruleLearningLib.AlignmentTemplate_generate_all_structural_generalisations(
                    at, generalisationOptions)
                timeStructuralvariations += (time() - starttime)
                structuralVariationsDictionary[at] = structuralVariationsAts
            else:
                debug(
                    "AT already found in structural generalisations. Not repeating work"
                )

            # Build the cache key for lexicalisations: the AT stripped of
            # inflection tags, with the concrete lemmas attached.
            lemmasposandalignments = at.fast_clone()
            lemmasposandalignments.remove_all_inflection_tags()
            cleanAT = lemmasposandalignments.fast_clone()
            lemmasposandalignments.set_lemmas(sllemmas, tllemmas)
            lemmasposandalignments.tl_lemmas_from_dictionary = tllemmasfromdictionary

            #lexicalisations
            if not lemmasposandalignments in lexicalVariationsDictionary:
                starttime = time()
                lexicalVariationsAtsF = ruleLearningLib.AlignmentTemplate_generate_all_lexical_generalisations(
                    cleanAT, sllemmas, tllemmas, tllemmasfromdictionary,
                    generalisationOptions.is_unlexicaliseUnalignedSL())
                # Optional whitelist filter on the SL side.
                if allowedSLLemmas:
                    lexicalVariationsAts = [
                        myat for myat in lexicalVariationsAtsF
                        if tuple(myat.get_sl_lemmas()) in allowedSLLemmas
                    ]
                else:
                    lexicalVariationsAts = lexicalVariationsAtsF

                timeLexicalVariations += (time() - starttime)
                lexicalVariationsDictionary[
                    lemmasposandalignments] = lexicalVariationsAts

            #removing alignments
            # Cross product: every structural variant combined with every
            # lexical variant, then every alignment-removal option of that
            # combination becomes a candidate AT.
            starttime = time()
            for atstruct in structuralVariationsDictionary[at]:
                for atlex in lexicalVariationsDictionary[
                        lemmasposandalignments]:
                    newat = atstruct.fast_clone()
                    newat.set_lemmas_from_other_at(atlex)
                    options = newat.get_unalignment_options_for_multiple_aligned_unlexicalised_tl_words(
                        lemmasposandalignments)
                    for option in options:
                        atcopy = newat.fast_clone()
                        atcopy.remove_alignments(option)
                        atcopy.alignments.sort()
                        debug("Obtained AT: " + str(atcopy))

                        # Accumulate the afterwards restrictions of every
                        # generation path that produced this AT.
                        if not atcopy in afterwardsDictionary:
                            afterwardsDictionary[atcopy] = list()
                        afterwardsDictionary[atcopy].append(
                            atcopy.afterwards_restrictions)

                        if not finalAlignmentTemplates.is_in_set(atcopy):
                            debug("is NOT in set")
                            idAt += 1
                            atcopy.id = idAt
                            finalAlignmentTemplates.add(atcopy)
            timeRemovingWrongAlignments += (time() - starttime)
        else:
            print >> sys.stderr, "WRONG GENERATION METHOD"

    # NOTE(review): 'idAT' (capital T) is assigned but never read below;
    # this looks like a typo for 'idAt' intended to reset the id counter —
    # confirm before relying on id values of the restriction ATs.
    idAT = len(finalAlignmentTemplates.get_all_ats_list())
    # NOTE(review): unqualified AlignmentTemplateSet here (the rest of the
    # function uses ruleLearningLib.AlignmentTemplateSet) — presumably the
    # name is also imported directly; verify at module level.
    finalAlignmentTemplatesAfterwardsRestrictions = AlignmentTemplateSet()

    if ruleLearningLib.DEBUG:
        debug("All the bilingual phrases:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t" + str(bilphrase))
            tllemmaslocal = u" ".join([
                "'" + lem + "'" for lem in bilphrase.tl_lemmas_from_dictionary
            ])
            debug("TL lemmas: " + tllemmaslocal.encode('utf-8'))

    # Phase 2: score every candidate AT against the bilingual phrases;
    # cache (idsOk, idMatching, numOk, numMatching) per AT for reuse below.
    matchingBilphrasesDict = dict()
    for at in finalAlignmentTemplates.get_all_ats_list():
        starttime = time()
        idsOk, idMatching, numOk, numMatching = bilingualPhrases.get_ids_of_matching_and_compatible_phrases(
            at)
        timeCorrectAndIncorrect += (time() - starttime)
        matchingBilphrasesDict[at] = (idsOk, idMatching, numOk, numMatching)
        at.freq = numOk
        debug("precomputing matching and OK bilingual phrases for at: " +
              str(at))
        debug("numOK: " + str(numOk) + " numMatching: " + str(numMatching))

    # Phase 3: optionally derive additional ATs carrying "afterwards
    # restrictions" that exclude the incorrectly matched phrases.
    debug("Final ATs:")
    for at in finalAlignmentTemplates.get_all_ats_list():
        if generalisationOptions.is_refToBiling(
        ) and not generalisationOptions.is_differentRestrictionOptions(
        ) and generalisationOptions.is_generalise(
        ) and not generalisationOptions.is_addRestrictionsForEveryTag():
            at.shorten_restrictions()

        idsOk, idMatching, numOk, numMatching = matchingBilphrasesDict[at]
        debug(str(at))
        debug("with numOK = " + str(numOk) + " and freq = " + str(at.freq))

        if generalisationOptions.get_possibleValuesForRestrictions(
        ) == AT_GeneralisationOptions.VALUE_FOR_RESTRICTION_TRIGGERINGCHANGE:
            starttime = time()

            # Collect the other ATs sharing this one's SL lexical side and
            # restrictions, with their reproduced/incorrect phrase sets.
            atsSharingLeftSide = list()
            for atSharing in finalAlignmentTemplates.get_ats_with_same_sllex_and_restrictions(
                    at):
                if atSharing != at:
                    reproducedBilphrasesOfSharing = AlignmentTemplateSet()
                    incorrectBilphrasesOfSharing = AlignmentTemplateSet()
                    idsOkS, idMatchingS, numOkS, numMatchingS = matchingBilphrasesDict[
                        atSharing]
                    # Incorrect = matched but not correctly reproduced.
                    incorrectIds = set(idMatchingS) - set(idsOkS)
                    for incorrectId in incorrectIds:
                        incorrectBilphrasesOfSharing.add(
                            bilingualPhrases.get_by_id(incorrectId))
                    for idOK in idsOkS:
                        reproducedBilphrasesOfSharing.add(
                            bilingualPhrases.get_by_id(idOK))
                    atsSharingLeftSide.append(
                        (atSharing, reproducedBilphrasesOfSharing,
                         incorrectBilphrasesOfSharing, numOkS))

            incorrectBilphrases = AlignmentTemplateSet()
            incorrectIds = set(idMatching) - set(idsOk)
            for incorrectId in incorrectIds:
                incorrectBilphrases.add(
                    bilingualPhrases.get_by_id(incorrectId))

            reproducedBilphrases = AlignmentTemplateSet()
            for idOK in idsOk:
                reproducedBilphrases.add(bilingualPhrases.get_by_id(idOK))

            debug("Processing AT to add restrictions: " + str(at))
            debug("Matching bilphrases (" + str(len(idMatching)) + "):")
            if ruleLearningLib.DEBUG:
                for bid in idMatching:
                    debug("\t" + str(bilingualPhrases.get_by_id(bid)))
            debug("Reproduced bilphrases (" + str(len(idsOk)) + "):")
            if ruleLearningLib.DEBUG:
                for bid in idsOk:
                    debug("\t" + str(bilingualPhrases.get_by_id(bid)))
            debug("Incorrect bilphrases (" + str(len(incorrectIds)) + ") :")
            if ruleLearningLib.DEBUG:
                for inat in incorrectBilphrases.get_all_ats_list():
                    debug("\t" + str(inat.id) + ": " + inat.to_string(
                        removeRestrictionsFromLexicalised=False))

            #represent possible restrictions to be added as tuples
            # Each candidate restriction is (word index, key, value), taken
            # only from non-lexicalised SL words.
            allOptions = list()
            afterwardsRestrictionItemIndex = 0
            for afterwards_restriction_item in afterwardsDictionary[at]:
                afterwardsRestrictionItemIndex += 1
                restrictionsAsTuples = list()
                for i in range(len(afterwards_restriction_item)):
                    #only add restrictions for non-lexicalised words
                    if not at.parsed_sl_lexforms[i].has_lemma():
                        afterwardDict = afterwards_restriction_item[i]
                        for key in afterwardDict:
                            tuplerep = (i, key, afterwardDict[key])
                            restrictionsAsTuples.append(tuplerep)

                debug("Possible values for restrictions " +
                      str(afterwardsRestrictionItemIndex) + ": " +
                      str(restrictionsAsTuples))

                #compute power set
                options = powerset(restrictionsAsTuples)
                allOptions.extend(options)

            # Deduplicate: order within an option does not matter.
            allOptionsFrozenUniq = list(set([frozenset(o)
                                             for o in allOptions]))

            #sort options by number of components
            sortedOptions = sorted(allOptionsFrozenUniq, key=len)
            if generalisationOptions.is_triggeringLimitedLength():
                # Options longer than the AT's SL side are discarded; the
                # list is length-sorted, so truncate at the first too-long one.
                positionOfFirstInvalidOption = None
                for k in range(len(sortedOptions)):
                    if len(sortedOptions[k]) > len(at.parsed_sl_lexforms):
                        positionOfFirstInvalidOption = k
                        break
                if positionOfFirstInvalidOption != None:
                    sortedOptions = sortedOptions[:
                                                  positionOfFirstInvalidOption]

            # Maps the frozenset of incorrect ids excluded by an option to
            # the options that achieved that exclusion, to skip supersets
            # that bring nothing new.
            incorrectIdsNotMatchingDict = dict()

            # Process options shortest-first; the queue may be pruned inside
            # the loop when an option already excludes all incorrect phrases.
            while len(sortedOptions) > 0:
                opt = sortedOptions.pop(0)
                optlen = len(opt)
                debug("Added restrictions option: " + str(opt))

                #matchesZero=False
                #for resSetMatchingZero in restrictionsSetsMatchingZero:
                #    if opt <= resSetMatchingZero:
                #        matchesZero=True
                #        break
                #if matchesZero:
                #    break

                newAT = at.fast_clone()
                newAT.add_restrictions_from_tuples(opt)

                # NOTE(review): assumes get_ids_of_matching_and_compatible_phrases
                # returns set-like id collections (set operators are applied
                # directly below) — confirm in ruleLearningLib.
                idsOk, idMatching, numOk, numMatching = incorrectBilphrases.get_ids_of_matching_and_compatible_phrases(
                    newAT)
                incorrectIdsNotMatching = frozenset(incorrectIds - idMatching)

                idsOKFromReproducible, idsMatchingFromReproducible, numOkFromRepr, numMatchingFromRepr = reproducedBilphrases.get_ids_of_matching_and_compatible_phrases(
                    newAT)
                totalReproduciblePhrases = len(
                    reproducedBilphrases.get_all_ids())
                numReproduciblePhrasesNowNOtMatching = totalReproduciblePhrases - len(
                    idsOKFromReproducible)
                debug("Reproducible phrases which now don't match: " +
                      str(numReproduciblePhrasesNowNOtMatching))

                # A restriction set is "valid" if, for at least one AT sharing
                # the left side, the combination (sharing AT minus what newAT
                # takes over, plus what newAT reproduces) beats the sharing AT
                # alone and newAT is right on a majority of what it matches.
                atLeastOneValid = False
                if generalisationOptions.is_discardRestrictionsNotImproving():
                    for atSharing, reproducedSharing, incorrectSharing, numOkofSharing in atsSharingLeftSide:
                        idsOkS, idMatchingS, numOkS, numMatchingS = incorrectSharing.get_ids_of_matching_and_compatible_phrases(
                            newAT)
                        idsOKFromReproducibleS, idsMatchingFromReproducibleS, numOkFromReprS, numMatchingFromReprS = reproducedSharing.get_ids_of_matching_and_compatible_phrases(
                            newAT)
                        if ruleLearningLib.DEBUG:
                            debug("\tAT sharing left side: " + str(atSharing))
                            debug("\t New AT matches " + str(numMatchingS) +
                                  " bilphrases out of " +
                                  str(incorrectSharing.get_total_freq()) +
                                  " incorrect bilphrases")
                            debug("\t  reproduces " + str(numOkS) + "/" +
                                  str(numMatchingS))
                            debug("\t New AT matches " +
                                  str(numMatchingFromReprS) +
                                  " bilphrases out of " +
                                  str(reproducedSharing.get_total_freq()) +
                                  " reproduced bilphrases")
                            debug("\t  reproduces " + str(numOkFromReprS) +
                                  "/" + str(numMatchingFromReprS))
                        phrasesCorrectlyReproducedByCombo = set()

                        #first, the bilingual phrases correctly reproduced by atSharing minus the bilingual phrases matched by newAT
                        phrasesCorrectlyReproducedByCombo.update(
                            reproducedSharing.get_all_ids())
                        phrasesCorrectlyReproducedByCombo.difference_update(
                            idMatchingS)
                        phrasesCorrectlyReproducedByCombo.difference_update(
                            idsMatchingFromReproducibleS)

                        #in addition, the bilingual phrases correctly reproduced by 'newAT' which were matched by AtSharing
                        phrasesCorrectlyReproducedByCombo.update(idsOkS)
                        phrasesCorrectlyReproducedByCombo.update(
                            idsOKFromReproducibleS)

                        totalFreqOfPhrasesReproducedByCombo = sum(
                            bilingualPhrases.get_by_id(bid).freq
                            for bid in phrasesCorrectlyReproducedByCombo)
                        totalFreqOfPhrasesReproducedBySharingAT = numOkofSharing
                        debug("\t" + str(totalFreqOfPhrasesReproducedByCombo) +
                              " phrases reproduced by combo vs. " +
                              str(totalFreqOfPhrasesReproducedBySharingAT) +
                              "phrases reproduced by AT sharing left side")
                        debug("\t" + str(numOkFromRepr) +
                              " phrases reproduced by newAT vs. " +
                              str(totalFreqOfPhrasesReproducedBySharingAT) +
                              "phrases reproduced by AT sharing left side")
                        if numOkFromRepr < totalFreqOfPhrasesReproducedBySharingAT and totalFreqOfPhrasesReproducedByCombo > totalFreqOfPhrasesReproducedBySharingAT and numOkS > numMatchingS / 2:
                            debug("\tRestriction VALID for this shared AT")
                            atLeastOneValid = True
                        else:
                            debug("\tRestriction NOT valid for this shared AT")

                # Accept the restricted AT unless it loses reproducible
                # phrases (when that pruning is enabled) or fails the
                # "improves on a sharing AT" test (when that one is enabled;
                # the empty option is always allowed through).
                if (numReproduciblePhrasesNowNOtMatching == 0 or
                        not generalisationOptions.is_triggeringNoGoodDiscarded(
                        )) and (not generalisationOptions.
                                is_discardRestrictionsNotImproving()
                                or atLeastOneValid or optlen == 0):
                    if ruleLearningLib.DEBUG:
                        debug("Incorrect bilphrases which now don't match (" +
                              str(len(incorrectIdsNotMatching)) + "):")
                        for bid in incorrectIdsNotMatching:
                            debug("\t" + str(bilingualPhrases.get_by_id(bid)))

                    if len(incorrectIdsNotMatching) > 0:
                        validAT = True
                        # Skip this option if a subset of it already excluded
                        # exactly the same incorrect phrases.
                        if incorrectIdsNotMatching in incorrectIdsNotMatchingDict:
                            debug(
                                "The same set of bilingual phrases was removed by other sets of restrictions..."
                            )
                            for pastoption in incorrectIdsNotMatchingDict[
                                    incorrectIdsNotMatching]:
                                if pastoption <= opt:
                                    debug(
                                        "... and there is a subset of this one: "
                                        + str(pastoption))
                                    validAT = False
                                    break
                            if validAT:
                                debug("... but no set is a subset of this one")
                        else:
                            debug(
                                "The same set of bilingual phrases was NOT removed by other sets of restrictions."
                            )
                            incorrectIdsNotMatchingDict[
                                incorrectIdsNotMatching] = set()
                            incorrectIdsNotMatchingDict[
                                incorrectIdsNotMatching].add(opt)
                        if validAT:
                            debug("SET OF RESTRICTIONS OK")
                            idAt += 1
                            newAT.id = idAt
                            finalAlignmentTemplatesAfterwardsRestrictions.add(
                                newAT)
                    if len(idMatching) == 0:
                        debug(
                            "This AT does not match any incorrect bilingual phrase. Removing all its supersets"
                        )
                        #restrictionsSetsMatchingZero.add(opt)
                        # Prune every remaining option that is a superset of
                        # this one — it cannot exclude anything more.
                        sortedOptionsCopy = list()
                        for sopt in sortedOptions:
                            if not opt <= sopt:
                                sortedOptionsCopy.append(sopt)
                        sortedOptions = sortedOptionsCopy
                else:
                    debug("Set of restrictions not generated")
                debug("")

            timeAfterwardsRestrictions += (time() - starttime)

    # Recompute frequencies for the restriction-carrying ATs.
    debug("Final ATs with afterwards restrictions:")
    for at in finalAlignmentTemplatesAfterwardsRestrictions.get_all_ats_list():
        starttime = time()
        idsOk, idMatching, numOk, numMatching = bilingualPhrases.get_ids_of_matching_and_compatible_phrases(
            at)
        timeCorrectAndIncorrect += (time() - starttime)
        at.freq = numOk
        debug(str(at))

    # Emit both AT sets to stdout and the timing report to stderr.
    finalAlignmentTemplates.write(sys.stdout)
    finalAlignmentTemplatesAfterwardsRestrictions.write(sys.stdout)

    print >> sys.stderr, "Time performing structural generalisation: " + str(
        timeStructuralvariations)
    print >> sys.stderr, "Time performing lexical generalisation: " + str(
        timeLexicalVariations)
    print >> sys.stderr, "Time removing wrong alignments: " + str(
        timeRemovingWrongAlignments)
    print >> sys.stderr, "Time computing correct and matching ATs: " + str(
        timeCorrectAndIncorrect)
    print >> sys.stderr, "Time generating afterwards restrictions: " + str(
        timeAfterwardsRestrictions)
示例#4
0
        # NOTE(review): fragment — the enclosing definition starts before and
        # continues after this view; the reading loop below is truncated.
        if gzip_files:
            file = gzip.open(args.ats_with_allowed_lemmas_file, 'r')
        else:
            file = open(args.ats_with_allowed_lemmas_file, 'r')
        # One AT per line; only its SL-lemma tuple is kept as a whitelist entry.
        for line in file:
            line = line.strip().decode('utf-8')
            at = ruleLearningLib.AlignmentTemplate()
            at.parse(line)
            allowedSLLemmas.add(tuple(at.get_sl_lemmas()))
        file.close()

    #Read bilingual phrases
    originalATList = list()

    #store bilingual phrases in an efficient way
    bilingualPhrases = ruleLearningLib.AlignmentTemplateSet()

    print >> sys.stderr, "Reading Bilingual Phrases...."
    bilid = 0
    # Input format (same as elsewhere in this file): '|'-separated fields,
    # freq first, then the AT text, then tab-separated lemma lists.
    for line in inputFile:

        line = line.decode('utf-8')
        at = ruleLearningLib.AlignmentTemplate()
        bilphrase = ruleLearningLib.AlignmentTemplate()

        piecesOfline = line.split(u'|')
        textat = u'|'.join(piecesOfline[1:5])
        freq = piecesOfline[0].strip()

        sllemmastext = piecesOfline[5].strip()
        tllemmastext = piecesOfline[6].strip()