def no_test_tonto(self): taggroups=self.taggroups bilphrasesSet=ruleLearningLib.AlignmentTemplateSet(taggroups) originalATList=list() numAt=0 print >> sys.stderr, "Reading ALignment Templates/ Bilingual Phrases...." for line in sys.stdin: numAt+=1 line=line.decode('utf-8').strip() at = ruleLearningLib.AlignmentTemplate() piecesOfline=line.split(u'|') textat=u'|'.join(piecesOfline[1:5]) freq=piecesOfline[0].strip() sllemmastext=piecesOfline[5].strip() tllemmastext=piecesOfline[6].strip() sllemmas=sllemmastext.split(u'\t') tllemmas=tllemmastext.split(u'\t') at.parse(textat) at.freq=int(freq) tl_lemmas_from_dictionary_text=piecesOfline[7].strip() tl_lemmas_from_dictionary_list=tl_lemmas_from_dictionary_text.split(u'\t') bilphrase=copy.deepcopy(at) bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list bilphrase.lexicalise_all(sllemmas,tllemmas) bilphrase.id=numAt bilphrasesSet.add(bilphrase) originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list)) #print bilphrase.tl_lemmas_from_dictionary print >> sys.stderr, " ....."+str(len(originalATList))+" items." solution=generaliseATs.generalise_by_linear_program(bilphrasesSet,originalATList,taggroups) for at in solution: print at
# --- Script section: work out where the generalised alignment-template
# (.ats) file lives for the current intermediate input file, then start
# reading it. ---
# NOTE(review): this is an excerpt of a longer script; the trailing 'for'
# loop body is probably continued past this chunk — confirm against the
# full file before editing.
if uponedir_files:
    # Replace the second-to-last path component with "ats"
    # (e.g. .../<subdir>/<file>  ->  .../ats/<file>).
    parts_slash = intermediate_input_file_only_number.split("/")
    parts_slash = parts_slash[:-2] + ["ats"] + parts_slash[-1:]
    intermediate_input_file_only_number_aux = "/".join(parts_slash)
elif read_generalised_bilphrases_from_dir:
    # Keep the basename but look for it inside the directory supplied
    # on the command line.
    parts_slash = intermediate_input_file_only_number.split("/")
    intermediate_input_file_only_number_aux = read_generalised_bilphrases_from_dir + "/" + parts_slash[
        -1]
gzipSuffix = ""
if gzip_files:
    gzipSuffix = ".gz"
#Read ATs
finalAlignmentTemplates = ruleLearningLib.AlignmentTemplateSet()
atsFileName = intermediate_input_file + ".ats" + gzipSuffix
# 'file' shadows the builtin; kept as-is (Python 2 script style).
if gzip_files:
    file = gzip.open(atsFileName, 'r')
else:
    file = open(atsFileName, 'r')
id = 0
atsByFreq = dict()
# Each line is: <freq> | <alignment-template fields ...>
for line in file:
    line = line.strip().decode('utf-8')
    parts = line.split(u" | ")
    at = ruleLearningLib.AlignmentTemplate()
    at.parse(u" | ".join(parts[1:]))
    at.freq = int(parts[0])
def process_bilingual_phrases(atListWithLemmasList, bilingualPhrases,
                              generalisationOptions, generationMethod,
                              allowedSLLemmas):
    """Generalise bilingual phrases into alignment templates (ATs).

    Parameters (types inferred from use — confirm against callers):
      atListWithLemmasList: list of tuples
          (at, sllemmas, tllemmas, tllemmasfromdictionary).
      bilingualPhrases: AlignmentTemplateSet of fully lexicalised phrases,
          queried by id and by matching/compatibility.
      generalisationOptions: AT_GeneralisationOptions-like flags object.
      generationMethod: one of AlignmentTemplateGenerationMethod.*.
      allowedSLLemmas: optional set of SL-lemma tuples used to filter
          lexical generalisations (falsy = no filtering).

    Returns None.  Side effects: writes the two resulting AT sets to
    sys.stdout and timing statistics to sys.stderr.

    NOTE(review): indentation of this function was reconstructed from a
    whitespace-mangled source; the nesting shown here is a best-effort
    reading — verify the flagged points below against the original file.
    """
    finalAlignmentTemplates = ruleLearningLib.AlignmentTemplateSet()
    idAt = 1
    # Caches keyed by AT so repeated phrases do not redo generalisation work.
    structuralVariationsDictionary = dict()
    lexicalVariationsDictionary = dict()
    # Maps each generated AT to the list of its possible
    # "afterwards restrictions" (per-word tag/value dicts).
    afterwardsDictionary = dict()
    # Wall-clock accumulators reported on stderr at the end.
    timeStructuralvariations = 0.0
    timeLexicalVariations = 0.0
    timeRemovingWrongAlignments = 0.0
    timeCorrectAndIncorrect = 0.0
    timeAfterwardsRestrictions = 0.0
    # --- Phase 1: generate candidate ATs from every bilingual phrase. ---
    for atWithLemmas in atListWithLemmasList:
        at = atWithLemmas[0]
        sllemmas = atWithLemmas[1]
        tllemmas = atWithLemmas[2]
        tllemmasfromdictionary = atWithLemmas[3]
        debug("Generalising " + str(at) + " | " + str(sllemmas) + " | " +
              str(tllemmas))
        if generationMethod == AlignmentTemplateGenerationMethod.FIRST_APPROACH:
            subsetsGraph = ruleLearningLib.SubsetGraph()
            # Helper adds the generalisations directly to
            # finalAlignmentTemplates and returns the updated id counter.
            idAt = ruleLearningLib.AlignmentTemplate_generate_all_generalisations_and_add_them(
                at, sllemmas, tllemmas, tllemmasfromdictionary,
                finalAlignmentTemplates, idAt, subsetsGraph, True, True,
                generalisationOptions.get_genWhenEmptyTLCats(),
                generalisationOptions.get_genWhenEmptySLCats())
        elif generationMethod == AlignmentTemplateGenerationMethod.TL_VARIABLES:
            debug("Checking whether hash '" + str(hash(at)) +
                  "' is in the dictionary |d| = " +
                  str(len(structuralVariationsDictionary)) + ".")
            #wildcard and reference values
            if not at in structuralVariationsDictionary:
                debug("AT not found in structural generalisations")
                starttime = time()
                structuralVariationsAts = ruleLearningLib.AlignmentTemplate_generate_all_structural_generalisations(
                    at, generalisationOptions)
                timeStructuralvariations += (time() - starttime)
                structuralVariationsDictionary[at] = structuralVariationsAts
            else:
                debug("AT already found in structural generalisations. Not repeating work")
            # Clone used as the lexical-variation cache key: lemmas + POS +
            # alignments, with inflection tags stripped.
            lemmasposandalignments = at.fast_clone()
            lemmasposandalignments.remove_all_inflection_tags()
            cleanAT = lemmasposandalignments.fast_clone()
            lemmasposandalignments.set_lemmas(sllemmas, tllemmas)
            lemmasposandalignments.tl_lemmas_from_dictionary = tllemmasfromdictionary
            #lexicalisations
            if not lemmasposandalignments in lexicalVariationsDictionary:
                starttime = time()
                lexicalVariationsAtsF = ruleLearningLib.AlignmentTemplate_generate_all_lexical_generalisations(
                    cleanAT, sllemmas, tllemmas, tllemmasfromdictionary,
                    generalisationOptions.is_unlexicaliseUnalignedSL())
                # Optionally keep only variations whose SL lemmas are in the
                # allowed set.
                if allowedSLLemmas:
                    lexicalVariationsAts = [
                        myat for myat in lexicalVariationsAtsF
                        if tuple(myat.get_sl_lemmas()) in allowedSLLemmas
                    ]
                else:
                    lexicalVariationsAts = lexicalVariationsAtsF
                timeLexicalVariations += (time() - starttime)
                lexicalVariationsDictionary[
                    lemmasposandalignments] = lexicalVariationsAts
            #removing alignments
            # Cross every structural variation with every lexical variation,
            # then try each way of unaligning multiply-aligned
            # unlexicalised TL words.
            starttime = time()
            for atstruct in structuralVariationsDictionary[at]:
                for atlex in lexicalVariationsDictionary[
                        lemmasposandalignments]:
                    newat = atstruct.fast_clone()
                    newat.set_lemmas_from_other_at(atlex)
                    options = newat.get_unalignment_options_for_multiple_aligned_unlexicalised_tl_words(
                        lemmasposandalignments)
                    for option in options:
                        atcopy = newat.fast_clone()
                        atcopy.remove_alignments(option)
                        atcopy.alignments.sort()
                        debug("Obtained AT: " + str(atcopy))
                        # Record every candidate's afterwards restrictions,
                        # even for duplicates already in the final set.
                        if not atcopy in afterwardsDictionary:
                            afterwardsDictionary[atcopy] = list()
                        afterwardsDictionary[atcopy].append(
                            atcopy.afterwards_restrictions)
                        if not finalAlignmentTemplates.is_in_set(atcopy):
                            debug("is NOT in set")
                            idAt += 1
                            atcopy.id = idAt
                            finalAlignmentTemplates.add(atcopy)
            timeRemovingWrongAlignments += (time() - starttime)
        else:
            print >> sys.stderr, "WRONG GENERATION METHOD"
    # NOTE(review): 'idAT' differs only in case from 'idAt' and is never
    # read afterwards — possibly a typo for 'idAt = ...'; confirm intent.
    idAT = len(finalAlignmentTemplates.get_all_ats_list())
    finalAlignmentTemplatesAfterwardsRestrictions = AlignmentTemplateSet()
    if ruleLearningLib.DEBUG:
        debug("All the bilingual phrases:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t" + str(bilphrase))
            tllemmaslocal = u" ".join([
                "'" + lem + "'" for lem in bilphrase.tl_lemmas_from_dictionary
            ])
            debug("TL lemmas: " + tllemmaslocal.encode('utf-8'))
    # --- Phase 2: precompute, for every AT, which bilingual phrases it
    # matches and which of those it reproduces correctly. ---
    matchingBilphrasesDict = dict()
    for at in finalAlignmentTemplates.get_all_ats_list():
        starttime = time()
        idsOk, idMatching, numOk, numMatching = bilingualPhrases.get_ids_of_matching_and_compatible_phrases(
            at)
        timeCorrectAndIncorrect += (time() - starttime)
        matchingBilphrasesDict[at] = (idsOk, idMatching, numOk, numMatching)
        # Frequency of an AT = number of phrases it reproduces correctly.
        at.freq = numOk
        debug("precomputing matching and OK bilingual phrases for at: " +
              str(at))
        debug("numOK: " + str(numOk) + " numMatching: " + str(numMatching))
    debug("Final ATs:")
    # --- Phase 3: optionally add "afterwards restrictions" to each AT. ---
    for at in finalAlignmentTemplates.get_all_ats_list():
        if generalisationOptions.is_refToBiling(
        ) and not generalisationOptions.is_differentRestrictionOptions(
        ) and generalisationOptions.is_generalise(
        ) and not generalisationOptions.is_addRestrictionsForEveryTag():
            at.shorten_restrictions()
        idsOk, idMatching, numOk, numMatching = matchingBilphrasesDict[at]
        debug(str(at))
        debug("with numOK = " + str(numOk) + " and freq = " + str(at.freq))
        if generalisationOptions.get_possibleValuesForRestrictions(
        ) == AT_GeneralisationOptions.VALUE_FOR_RESTRICTION_TRIGGERINGCHANGE:
            starttime = time()
            # Collect the other ATs sharing this one's SL lexical side and
            # restrictions, with their reproduced/incorrect phrase sets.
            atsSharingLeftSide = list()
            for atSharing in finalAlignmentTemplates.get_ats_with_same_sllex_and_restrictions(
                    at):
                if atSharing != at:
                    reproducedBilphrasesOfSharing = AlignmentTemplateSet()
                    incorrectBilphrasesOfSharing = AlignmentTemplateSet()
                    idsOkS, idMatchingS, numOkS, numMatchingS = matchingBilphrasesDict[
                        atSharing]
                    incorrectIds = set(idMatchingS) - set(idsOkS)
                    for incorrectId in incorrectIds:
                        incorrectBilphrasesOfSharing.add(
                            bilingualPhrases.get_by_id(incorrectId))
                    for idOK in idsOkS:
                        reproducedBilphrasesOfSharing.add(
                            bilingualPhrases.get_by_id(idOK))
                    atsSharingLeftSide.append(
                        (atSharing, reproducedBilphrasesOfSharing,
                         incorrectBilphrasesOfSharing, numOkS))
            # Phrases this AT matches but does not reproduce correctly.
            incorrectBilphrases = AlignmentTemplateSet()
            incorrectIds = set(idMatching) - set(idsOk)
            for incorrectId in incorrectIds:
                incorrectBilphrases.add(
                    bilingualPhrases.get_by_id(incorrectId))
            reproducedBilphrases = AlignmentTemplateSet()
            for idOK in idsOk:
                reproducedBilphrases.add(bilingualPhrases.get_by_id(idOK))
            debug("Processing AT to add restrictions: " + str(at))
            debug("Matching bilphrases (" + str(len(idMatching)) + "):")
            if ruleLearningLib.DEBUG:
                for bid in idMatching:
                    debug("\t" + str(bilingualPhrases.get_by_id(bid)))
            debug("Reproduced bilphrases (" + str(len(idsOk)) + "):")
            if ruleLearningLib.DEBUG:
                for bid in idsOk:
                    debug("\t" + str(bilingualPhrases.get_by_id(bid)))
            debug("Incorrect bilphrases (" + str(len(incorrectIds)) + ") :")
            if ruleLearningLib.DEBUG:
                for inat in incorrectBilphrases.get_all_ats_list():
                    debug("\t" + str(inat.id) + ": " + inat.to_string(
                        removeRestrictionsFromLexicalised=False))
            #represent possible restrictions to be added as tuples
            # Each restriction tuple is (SL word index, tag key, tag value).
            allOptions = list()
            afterwardsRestrictionItemIndex = 0
            for afterwards_restriction_item in afterwardsDictionary[at]:
                afterwardsRestrictionItemIndex += 1
                restrictionsAsTuples = list()
                for i in range(len(afterwards_restriction_item)):
                    #only add restrictions for non-lexicalised words
                    if not at.parsed_sl_lexforms[i].has_lemma():
                        afterwardDict = afterwards_restriction_item[i]
                        for key in afterwardDict:
                            tuplerep = (i, key, afterwardDict[key])
                            restrictionsAsTuples.append(tuplerep)
                debug("Possible values for restrictions " +
                      str(afterwardsRestrictionItemIndex) + ": " +
                      str(restrictionsAsTuples))
                #compute power set
                options = powerset(restrictionsAsTuples)
                allOptions.extend(options)
            allOptionsFrozenUniq = list(set([frozenset(o) for o in allOptions]))
            #sort options by number of components
            sortedOptions = sorted(allOptionsFrozenUniq, key=len)
            if generalisationOptions.is_triggeringLimitedLength():
                # Drop every option with more restrictions than SL words
                # (list is length-sorted, so one cut point suffices).
                positionOfFirstInvalidOption = None
                for k in range(len(sortedOptions)):
                    if len(sortedOptions[k]) > len(at.parsed_sl_lexforms):
                        positionOfFirstInvalidOption = k
                        break
                if positionOfFirstInvalidOption != None:
                    sortedOptions = sortedOptions[:
                                                  positionOfFirstInvalidOption]
            # Maps each frozenset of "incorrect ids no longer matched" to the
            # restriction options that produced it (for subset pruning).
            incorrectIdsNotMatchingDict = dict()
            while len(sortedOptions) > 0:
                opt = sortedOptions.pop(0)
                optlen = len(opt)
                debug("Added restrictions option: " + str(opt))
                #matchesZero=False
                #for resSetMatchingZero in restrictionsSetsMatchingZero:
                #    if opt <= resSetMatchingZero:
                #        matchesZero=True
                #        break
                #if matchesZero:
                #    break
                newAT = at.fast_clone()
                newAT.add_restrictions_from_tuples(opt)
                # NOTE(review): these rebind the outer idsOk/idMatching/...
                # with values relative to incorrectBilphrases only — the
                # shadowing is in the original and appears intentional.
                idsOk, idMatching, numOk, numMatching = incorrectBilphrases.get_ids_of_matching_and_compatible_phrases(
                    newAT)
                incorrectIdsNotMatching = frozenset(incorrectIds - idMatching)
                idsOKFromReproducible, idsMatchingFromReproducible, numOkFromRepr, numMatchingFromRepr = reproducedBilphrases.get_ids_of_matching_and_compatible_phrases(
                    newAT)
                totalReproduciblePhrases = len(
                    reproducedBilphrases.get_all_ids())
                numReproduciblePhrasesNowNOtMatching = totalReproduciblePhrases - len(
                    idsOKFromReproducible)
                debug("Reproducible phrases which now don't match: " +
                      str(numReproduciblePhrasesNowNOtMatching))
                atLeastOneValid = False
                if generalisationOptions.is_discardRestrictionsNotImproving():
                    # A restriction set is "valid" if, for at least one AT
                    # sharing the left side, combining it with newAT covers
                    # more phrase frequency than that AT alone.
                    for atSharing, reproducedSharing, incorrectSharing, numOkofSharing in atsSharingLeftSide:
                        idsOkS, idMatchingS, numOkS, numMatchingS = incorrectSharing.get_ids_of_matching_and_compatible_phrases(
                            newAT)
                        idsOKFromReproducibleS, idsMatchingFromReproducibleS, numOkFromReprS, numMatchingFromReprS = reproducedSharing.get_ids_of_matching_and_compatible_phrases(
                            newAT)
                        if ruleLearningLib.DEBUG:
                            debug("\tAT sharing left side: " + str(atSharing))
                            debug("\t New AT matches " + str(numMatchingS) +
                                  " bilphrases out of " +
                                  str(incorrectSharing.get_total_freq()) +
                                  " incorrect bilphrases")
                            debug("\t reproduces " + str(numOkS) + "/" +
                                  str(numMatchingS))
                            debug("\t New AT matches " +
                                  str(numMatchingFromReprS) +
                                  " bilphrases out of " +
                                  str(reproducedSharing.get_total_freq()) +
                                  " reproduced bilphrases")
                            debug("\t reproduces " + str(numOkFromReprS) +
                                  "/" + str(numMatchingFromReprS))
                        phrasesCorrectlyReproducedByCombo = set()
                        #first, the bilingual phrases correctly reproduced by atSharing minus the bilingual phrases matched by newAT
                        phrasesCorrectlyReproducedByCombo.update(
                            reproducedSharing.get_all_ids())
                        phrasesCorrectlyReproducedByCombo.difference_update(
                            idMatchingS)
                        phrasesCorrectlyReproducedByCombo.difference_update(
                            idsMatchingFromReproducibleS)
                        #in addition, the bilingual phrases correctly reproduced by 'newAT' which were matched by AtSharing
                        phrasesCorrectlyReproducedByCombo.update(idsOkS)
                        phrasesCorrectlyReproducedByCombo.update(
                            idsOKFromReproducibleS)
                        totalFreqOfPhrasesReproducedByCombo = sum(
                            bilingualPhrases.get_by_id(bid).freq
                            for bid in phrasesCorrectlyReproducedByCombo)
                        totalFreqOfPhrasesReproducedBySharingAT = numOkofSharing
                        debug("\t" + str(totalFreqOfPhrasesReproducedByCombo) +
                              " phrases reproduced by combo vs. " +
                              str(totalFreqOfPhrasesReproducedBySharingAT) +
                              "phrases reproduced by AT sharing left side")
                        debug("\t" + str(numOkFromRepr) +
                              " phrases reproduced by newAT vs. " +
                              str(totalFreqOfPhrasesReproducedBySharingAT) +
                              "phrases reproduced by AT sharing left side")
                        if numOkFromRepr < totalFreqOfPhrasesReproducedBySharingAT and totalFreqOfPhrasesReproducedByCombo > totalFreqOfPhrasesReproducedBySharingAT and numOkS > numMatchingS / 2:
                            debug("\tRestriction VALID for this shared AT")
                            atLeastOneValid = True
                        else:
                            debug("\tRestriction NOT valid for this shared AT")
                # Accept the restriction set only if it loses no reproducible
                # phrase (unless that check is disabled) and it improves on
                # some sharing AT (unless that check is disabled / empty set).
                if (numReproduciblePhrasesNowNOtMatching == 0
                        or not generalisationOptions.is_triggeringNoGoodDiscarded(
                        )) and (not generalisationOptions.
                                is_discardRestrictionsNotImproving()
                                or atLeastOneValid or optlen == 0):
                    if ruleLearningLib.DEBUG:
                        debug("Incorrect bilphrases which now don't match (" +
                              str(len(incorrectIdsNotMatching)) + "):")
                        for bid in incorrectIdsNotMatching:
                            debug("\t" + str(bilingualPhrases.get_by_id(bid)))
                    if len(incorrectIdsNotMatching) > 0:
                        validAT = True
                        if incorrectIdsNotMatching in incorrectIdsNotMatchingDict:
                            debug(
                                "The same set of bilingual phrases was removed by other sets of restrictions..."
                            )
                            # Skip this option if a strict/equal subset of it
                            # already removed the same incorrect phrases.
                            for pastoption in incorrectIdsNotMatchingDict[
                                    incorrectIdsNotMatching]:
                                if pastoption <= opt:
                                    debug(
                                        "... and there is a subset of this one: "
                                        + str(pastoption))
                                    validAT = False
                                    break
                            if validAT:
                                debug("... but no set is a subset of this one")
                        else:
                            debug(
                                "The same set of bilingual phrases was NOT removed by other sets of restrictions."
                            )
                            incorrectIdsNotMatchingDict[
                                incorrectIdsNotMatching] = set()
                        incorrectIdsNotMatchingDict[
                            incorrectIdsNotMatching].add(opt)
                        if validAT:
                            debug("SET OF RESTRICTIONS OK")
                            idAt += 1
                            newAT.id = idAt
                            finalAlignmentTemplatesAfterwardsRestrictions.add(
                                newAT)
                            if len(idMatching) == 0:
                                debug(
                                    "This AT does not match any incorrect bilingual phrase. Removing all its supersets"
                                )
                                #restrictionsSetsMatchingZero.add(opt)
                                # Prune every remaining option that is a
                                # superset of the accepted one.
                                sortedOptionsCopy = list()
                                for sopt in sortedOptions:
                                    if not opt <= sopt:
                                        sortedOptionsCopy.append(sopt)
                                sortedOptions = sortedOptionsCopy
                # NOTE(review): this 'else' pairing (with the acceptance
                # guard above) was reconstructed from mangled whitespace —
                # it could also pair with 'if len(incorrectIdsNotMatching)
                # > 0'; verify against the original file.
                else:
                    debug("Set of restrictions not generated")
                debug("")
            timeAfterwardsRestrictions += (time() - starttime)
    # --- Phase 4: recompute frequencies of the restricted ATs and emit
    # both sets on stdout, then timing on stderr. ---
    debug("Final ATs with afterwards restrictions:")
    for at in finalAlignmentTemplatesAfterwardsRestrictions.get_all_ats_list():
        starttime = time()
        idsOk, idMatching, numOk, numMatching = bilingualPhrases.get_ids_of_matching_and_compatible_phrases(
            at)
        timeCorrectAndIncorrect += (time() - starttime)
        at.freq = numOk
        debug(str(at))
    finalAlignmentTemplates.write(sys.stdout)
    finalAlignmentTemplatesAfterwardsRestrictions.write(sys.stdout)
    print >> sys.stderr, "Time performing structural generalisation: " + str(
        timeStructuralvariations)
    print >> sys.stderr, "Time performing lexical generalisation: " + str(
        timeLexicalVariations)
    print >> sys.stderr, "Time removing wrong alignments: " + str(
        timeRemovingWrongAlignments)
    print >> sys.stderr, "Time computing correct and matching ATs: " + str(
        timeCorrectAndIncorrect)
    print >> sys.stderr, "Time generating afterwards restrictions: " + str(
        timeAfterwardsRestrictions)
# --- Script section: read the ATs whose SL lemmas are allowed, then start
# reading the bilingual phrases from the main input. ---
# NOTE(review): this is an excerpt of a longer script; the trailing 'for'
# loop body is probably continued past this chunk — confirm against the
# full file before editing.
if gzip_files:
    file = gzip.open(args.ats_with_allowed_lemmas_file, 'r')
else:
    file = open(args.ats_with_allowed_lemmas_file, 'r')
# Each line is one AT; only its SL lemma tuple is kept, as a filter key.
for line in file:
    line = line.strip().decode('utf-8')
    at = ruleLearningLib.AlignmentTemplate()
    at.parse(line)
    allowedSLLemmas.add(tuple(at.get_sl_lemmas()))
file.close()
#Read bilingual phrases
originalATList = list()
#store bilingual phrases in an efficient way
bilingualPhrases = ruleLearningLib.AlignmentTemplateSet()
print >> sys.stderr, "Reading Bilingual Phrases...."
bilid = 0
# Input line layout (pipe-separated): freq | <AT fields (4)> | SL lemmas |
# TL lemmas | ... (same format as elsewhere in this script).
for line in inputFile:
    line = line.decode('utf-8')
    at = ruleLearningLib.AlignmentTemplate()
    bilphrase = ruleLearningLib.AlignmentTemplate()
    piecesOfline = line.split(u'|')
    textat = u'|'.join(piecesOfline[1:5])
    freq = piecesOfline[0].strip()
    sllemmastext = piecesOfline[5].strip()
    tllemmastext = piecesOfline[6].strip()