wordsX = IO.readPickledWords(filename_wordsX) wordsY = IO.readPickledWords(filename_wordsY) else: wordsX = IO.readWords(filename_wordsX) wordsY = IO.readWords(filename_wordsY) if filename_lexicon == 'None': # we don't have a lexicon. assume identity. log(100, 'Using identity lexicon') lex = None gold_lex = dict() # for w in wordsX.words: gold_lex[w] = [w] log(100, gold_lex) else: lex = BilexiconUtil.readLexicon(filename_lexicon) (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words) log(100, 'Done filtering gold lexicon') seed = [] used_targets = set() for source_word in wordsX.words: # go over source if source_word in gold_lex: # check source in lexicon translations = gold_lex[source_word] # get translations of source for translation in translations: # then, append translations for non-translated sources. if translation in wordsY.words and translation not in used_targets: seed.append((source_word, translation)) used_targets.add(translation) print "%s,%s" % (source_word, translation) break else: pass #print source_word, '__NA__'
# Command-line arguments: the two word files and the seed file, in that order.
filename_wordsX = sys.argv[1]
filename_wordsY = sys.argv[2]
filename_seed = sys.argv[3]
options = parseOptions()

# Read input files.
wordsX, wordsY, seed_list = readInput(options, filename_wordsX, filename_wordsY, filename_seed)
N = len(wordsX.words)
# Output path encodes the run parameters (problem size, experiment id, alpha, T).
options.matchingFilename = "results/matching_N=%d_expid=%d_alpha=%2.2f_T=%d.txt" % (
    N,
    options.exp_id,
    options.alpha,
    options.T,
)
NSeed = len(seed_list.X)

if options.filename_lexicon is not None:
    lex = BU.readLexicon(options.filename_lexicon)
    # The gold lexicon is filtered against the word lists with their last
    # NSeed entries left out (presumably the seed words sit at the end of each
    # list -- verify against readInput). An explicit end index is used because
    # `words[:-NSeed]` degenerates to `words[:0]` (an empty list) when
    # NSeed == 0, which would filter the lexicon against empty vocabularies.
    n_non_seed_x = max(len(wordsX.words) - NSeed, 0)
    n_non_seed_y = max(len(wordsY.words) - NSeed, 0)
    (gold_lex, times) = BU.filterLexicon(
        lex, wordsX.words[:n_non_seed_x], wordsY.words[:n_non_seed_y]
    )
    options.gold_lex = gold_lex
    # Pre-formatted single-argument print: identical output under the
    # Python 2 print statement and the Python 3 print function.
    print("Gold lexicon contains %d pairs." % len(gold_lex))
else:
    options.gold_lex = None
    print(colored("WARNING: No gold lexicon", "red"))

# Progress banner goes to stderr, leaving stdout for results.
sys.stderr.write("==============#########==========\n")
sys.stderr.write("Starting mCCA:\n")
sys.stderr.write("%s seed pairs: %s\n" % (NSeed, list(zip(seed_list.X, seed_list.Y))))

(wordsX, wordsY, edge_cost, cost) = mcca(options, wordsX, wordsY, seed_list)
log(0, "hamming distance:", perm.hamming(wordsX.words, wordsY.words))
bell()