import argparse
import sys

import embeddings


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument('actions', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the actions to perform in order')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the input word embedding file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output word embedding file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions
    embeddings.normalize(matrix, args.actions)

    # Write normalized embeddings
    f = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(words, matrix, f)
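# Usage sketch (assumptions: the script above is saved as normalize_embeddings.py and
# the companion embeddings module from this repository is importable). Actions are
# applied in the order given, e.g. length-normalize, mean-center, then length-normalize
# again:
#
#   python3 normalize_embeddings.py unit center unit -i wiki.en.vec -o wiki.en.norm.vec
#
# Reading from stdin and writing to stdout also works, since those are the defaults:
#
#   cat wiki.en.vec | python3 normalize_embeddings.py unit center > wiki.en.norm.vec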

# Cross-lingual mapping script, latent-variable variant (filename assumed; imports
# reconstructed from usage below). supports_cupy(), get_cupy(), asnumpy(), topk_mean()
# and dropout() are helpers assumed to be defined elsewhere in this repository.
import argparse
import collections
import re
import sys
import time

import numpy as np

import embeddings
import lat_var


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--test-dict', help='the test dictionary file')
    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    # Note: changed the argument so that dictionary is supplied with -d instead
    recommended_type.add_argument('--acl2017_seed', action='store_true', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')
    # still requires specifying a seed dictionary or another init
    recommended_type.add_argument('--ruder_emnlp2018', action='store_true', help='reproduce the EMNLP 2018 latent-variable model of Ruder et al.')
    recommended_type.add_argument('--ruder_emnlp2018_backward', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) with matching in backward direction')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018_unsupervised', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) on top of our ACL 2018 system with unsupervised initialization')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) on top of our ACL 2018 system')
    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    lat_var_group = parser.add_argument_group('arguments for latent-variable model', 'Arguments for latent-variable model')
    lat_var_group.add_argument('--lat-var', action='store_true', help='use the latent-variable model')
    lat_var_group.add_argument('--n-similar', type=int, default=3, help='number of most similar trg indices used for sparsifying in latent-variable model')
    lat_var_group.add_argument('--n-repeats', default=1, type=int, help='repeats embeddings to get 2:2, 3:3, etc. alignment in latent-variable model')
    lat_var_group.add_argument('--asym', default='1:1', help='specify 1:2 or 2:1 for asymmetric matching in latent-variable model')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    # reduce stochastic interval
    # note: just backward direction works surprisingly well
    if args.ruder_emnlp2018_artetxe_acl2018_unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018_artetxe_acl2018:
        parser.set_defaults(normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.ruder_emnlp2018_backward:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='backward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.init_dictionary, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)
    if args.verbose:
        print("Info: arguments\n\t" + "\n\t".join(
            ["{}: {}".format(a, v) for a, v in vars(args).items()]), file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=200000)
    trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=200000)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        if args.verbose:
            print('Using unsupervised initialization...')
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        if args.verbose:
            print('Using numerals as seeds...')
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        if args.verbose:
            print('Using identical strings as seeds...')
            print(f'Found {len(identical)} identical strings.')
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
    print(f'Using a dictionary of size {len(src_indices)}.')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            # allocated with xp (not np) so this also works under CuPy; the backward
            # pass below reuses the same buffer, which assumes src_size == trg_size
            # (as with the default vocabulary cutoffs)
            sims = xp.zeros((src_size, trg_size), dtype=dtype)
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    simfwd[:j-i] = dropout(simfwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        # we get a dimension mismatch here as lat_var may produce fewer seeds
                        simfwd[:j-i].argmax(axis=1, out=trg_indices_forward[i:j])
                    sims[i:j] = simfwd[:j-i]
                if args.lat_var:
                    # TODO check if we can save memory by not storing a large sims matrix
                    src_indices_forward, trg_indices_forward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    simbwd[:j-i] = dropout(simbwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        simbwd[:j-i].argmax(axis=1, out=src_indices_backward[i:j])
                    sims[i:j] = simbwd[:j-i]
                if args.lat_var:
                    # swap the order of the indices
                    trg_indices_backward, src_indices_backward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
            # elif args.direction == 'intersection':
            #     fwd_pairs = zip(src_indices_forward, trg_indices_forward)
            #     bwd_pairs = zip(src_indices_backward, trg_indices_backward)
            #     src_indices, trg_indices = zip(*set(fwd_pairs).intersection(bwd_pairs))
            #     src_indices, trg_indices = xp.array(src_indices), xp.array(trg_indices)

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

            t = time.time()
            it += 1

    if args.test_dict:
        # save the embeddings for evaluation
        with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
                open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
            embeddings.write(src_words, xw, srcfile)
            embeddings.write(trg_words, zw, trgfile)

        # EVALUATING TRANSLATION
        print('Evaluating translation...')
        # we skip length normalization here

        # Read dictionary and compute coverage
        f = open(args.test_dict, encoding=args.encoding, errors='surrogateescape')
        src2trg = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        src = list(src2trg.keys())
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        coverage = len(src2trg) / (len(src2trg) + len(oov))
        BATCH_SIZE = 500

        # Find translations
        translation = collections.defaultdict(int)  # we just use nearest neighbour for retrieval
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]

        # Compute accuracy
        accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
        print('Coverage:{0:7.2%} Accuracy:{1:7.2%}'.format(coverage, accuracy))

    # Write mapped embeddings
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, zw, trgfile)
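# Usage sketch for the mapping script above (assumptions: saved as map_embeddings.py;
# flag names come straight from the argparse definitions):
#
#   python3 map_embeddings.py --unsupervised -v src.emb trg.emb src.mapped.emb trg.mapped.emb
#   python3 map_embeddings.py --ruder_emnlp2018 -d seed.dict src.emb trg.emb src.mapped.emb trg.mapped.emb
#
# The training loop relies on two helpers not shown in this excerpt. The sketches
# below are reconstructions from the call sites, not the repository's definitions
# (hence the _sketch suffix): topk_mean(m, k, inplace=...) is the per-row mean of the
# k largest entries (the CSLS penalty), and dropout(m, p) zeroes entries with
# probability p (stochastic dictionary induction).

def topk_mean_sketch(m, k, inplace=False):
    # Mean of the k largest values in each row of m.
    n = m.shape[0]
    ans = np.zeros(n, dtype=m.dtype)
    if k <= 0:
        return ans
    if not inplace:
        m = np.array(m)
    ind0 = np.arange(n)
    minimum = m.min()
    for _ in range(k):  # take the row-wise max k times, masking each winner out
        ind1 = m.argmax(axis=1)
        ans += m[ind0, ind1]
        m[ind0, ind1] = minimum
    return ans / k


def dropout_sketch(m, p):
    # Zero each entry of m independently with probability p.
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask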

# Experimental mapping script with k-best alignment (filename assumed; imports
# reconstructed from usage below; supports_cupy(), get_cupy(), asnumpy(), topk_mean()
# and dropout() are helpers assumed to be defined elsewhere in this repository).
import argparse
import collections
import pickle
import re
import sys
import time

import numpy as np

import embeddings


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    # nargs='?' added so that the declared default is actually reachable for a positional argument
    parser.add_argument('dict_output', nargs='?', default='dictionary.pkl', help='the output dictionary pickle file')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--future', action='store_true', help='experiment with stuff')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')
    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments')
    future_group.add_argument('--max_align', type=int, default=1, help='number of top-ranked elements to align to each word (defaults to 1=base)')
    future_group.add_argument('--align_weight', choices=['unit', 'rr', 'softmax'], default='rr', help='weights assigned to ranked elements in maximization phase (unit - no weighting; rr - reciprocal rank; softmax - NOT IMPLEMENTED YET)')
    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', default='map.log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    # note: --unsupervised matches both of the next two branches; the second set_defaults
    # call repeats the shared settings but does not reset max_align/align_weight
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10, max_align=2, align_weight='rr')
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map (only relevant in supervised learning or with validation)
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    print(f'mapped {len(src_words)} source words')
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    print(f'mapped {len(trg_words)} target words')

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
        print('initialized unsupervised dictionary')
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized numeral dictionary')
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized identical dictionary')
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        f.close()
        print('initialized seed dictionary')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))
        print(f'loaded validation dictionary with {validation_coverage:.3f} coverage')

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((min(src_size, args.batch_size), trg_size), dtype=dtype)
    simbwd = xp.empty((min(trg_size, args.batch_size), src_size), dtype=dtype)
    # argsimsf = xp.empty((min(src_size, args.batch_size), args.max_align), dtype=int)
    # argsimsb = xp.empty((min(trg_size, args.batch_size), args.max_align), dtype=int)
    argsimsf = xp.empty((min(src_size, args.batch_size), 1), dtype=int)
    argsimsb = xp.empty((min(trg_size, args.batch_size), 1), dtype=int)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.array(list(range(src_size)) * args.max_align)
    trg_indices_forward = xp.zeros(src_size * args.max_align, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size * args.max_align, dtype=int)
    trg_indices_backward = xp.array(list(range(trg_size)) * args.max_align)
    xr = xp.zeros(((src_size+trg_size) * args.max_align, x.shape[1]), dtype=dtype)  # assumes "both" param
    zr = xp.zeros(((src_size+trg_size) * args.max_align, z.shape[1]), dtype=dtype)  # assumes "both" param
    all_coefs = xp.zeros(((src_size+trg_size) * args.max_align, 1), dtype=dtype)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    print('starting training')
    while True:
        if it % 50 == 0:
            print(f'starting iteration {it}')

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping (only affecting vectors that have dictionary mappings)
        if args.orthogonal or not end:  # orthogonal mapping
            if it == 1:  # only initialized alignment available
                u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            else:
                if args.align_weight == 'softmax':
                    ### TODO individualized softmax coefficients ###
                    # raising a bare string is invalid in Python 3, so raise a proper exception
                    raise NotImplementedError('Softmax weights not supported yet')
                else:
                    ### TODO I'm assuming here that the alignment method is 'both', so everything's double
                    ### TODO all_coefs can be computed outside the iteration loop
                    # format: src_size_0, ..., src_size_k-1, trg_size_0, ..., trg_size_k-1
                    ncopies = args.max_align
                    cutoffs = list(range(src_size*ncopies)[::src_size]) \
                        + list(range(src_size*ncopies, (src_size+trg_size)*ncopies)[::trg_size])
                    if args.align_weight == 'rr':
                        coefs = [1. / (k+1) for k in range(ncopies)] * 2
                    else:  # 'unit'
                        coefs = [1.] * (ncopies * 2)
                    for cf, co_s, co_e in zip(coefs, cutoffs, cutoffs[1:] + [len(all_coefs)]):
                        all_coefs[co_s:co_e] = cf
                    zr = z[trg_indices] * all_coefs
                    xr = x[src_indices] * all_coefs
                    u, s, vt = xp.linalg.svd(zr.T.dot(xr))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping (default for end, acl2018)

            # remove lower-rank transformations
            midpoint = src_size * args.max_align
            src_indices = xp.concatenate((src_indices[:src_size], src_indices[midpoint:midpoint+trg_size]))
            trg_indices = xp.concatenate((trg_indices[:src_size], trg_indices[midpoint:midpoint+trg_size]))

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z
            ### TODO entry point for adding more matrix operations ###

            # STEP 1: Whitening
            ### TODO figure out how weighted k-best affects this (and onwards) ###
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction (default: OFF (0))
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary (default direction - union)
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:  # default acl2018: 10
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)  # get next batch to operate on
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    # softmaxing
                    # argsimsf[:] = dropout(-simfwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsf = dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        # mask each row's current best so the next pass picks the runner-up
                        # (row-wise fancy indexing; the flattened original indexed whole columns here)
                        simfwd[xp.arange(j-i), argsimsf] = -200
                        trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf
                        # trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf[:,k]
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)  # get next batch to operate on
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    # softmaxing
                    # argsimsb[:] = dropout(-simbwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsb = dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        simbwd[xp.arange(j-i), argsimsb] = -200
                        # the argmax here runs over source words, so it fills the source side;
                        # the flattened original assigned into trg_indices_backward, which would
                        # have left src_indices_backward at zero (cf. the forward branch and the
                        # commented alternative below)
                        src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb
                        # src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb[:,k]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':  # default
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation (default - off)
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

            t = time.time()
            it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()

    # Write dictionary
    dictfile = open(args.dict_output, mode='wb')
    dictalign = list(zip(src_indices, trg_indices))
    pickle.dump(dictalign, dictfile)
    dictfile.close()
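# Usage sketch for the experimental variant above (assumptions: saved as
# map_embeddings_future.py; the positional dict_output and the flags come from the
# argparse definitions):
#
#   python3 map_embeddings_future.py --future -v src.emb trg.emb src.mapped.emb trg.mapped.emb alignment.pkl
#
# The pickled output is a list of (source index, target index) pairs and can be
# inspected like this:
#
#   import pickle
#   with open('alignment.pkl', 'rb') as f:
#       pairs = pickle.load(f)
#   print(pairs[:10])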

# Sentence indexing snippet (the excerpt begins mid-file: the surrounding loop over
# sentence_file and the variables dimension, num_elements, batch_size, words,
# weight4ind, We and params are defined earlier in the original script).
# Imports needed for this part:
#   import hnswlib
#   from itertools import islice
# Truncated condition fragment from the original, left as-is:
#   ... in sentence_file or '5-15' in sentence_file:
batch_num = 0
with open(sentence_file, 'r', encoding='utf-8') as fr:
    print('Processing file', sentence_file, '...')
    p = hnswlib.Index(space='cosine', dim=dimension)
    p.init_index(max_elements=num_elements, ef_construction=2000, M=80)
    p.set_ef(1000)
    # Set number of threads used during batch search/construction
    # By default using all available cores
    p.set_num_threads(30)
    for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
        sents = list(map(str.strip, n_lines))
        sent_id = list(map(lambda x: int(x.split('\t')[0]), sents))
        sentences = list(map(lambda x: x.split('\t')[-1], sents))
        x, m = data_io.sentences2idx(sentences, words)
        w = data_io.seq2weight(x, m, weight4ind)
        # get SIF embedding; embedding[i,:] is the embedding for sentence i
        embedding = SIF_embedding.SIF_embedding(We, x, w, params)
        embeddings.normalize(embedding, ["unit", "center"])
        p.add_items(embedding, sent_id)
        print('Finished batch', batch_num, '.', end='\r')
        batch_num += 1
    print('\nFinished loading', sentence_file, '.')
out_file = sentence_file + '.ann'
p.save_index(out_file)
print('Finished saving', out_file, '.')
del p
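# Querying the saved index, a minimal sketch (assumes the same 'dimension' and
# 'num_elements' as at build time; 'query_vecs' stands in for SIF embeddings produced
# and normalized the same way as during indexing):
#
#   import hnswlib
#   p = hnswlib.Index(space='cosine', dim=dimension)
#   p.load_index(out_file, max_elements=num_elements)
#   p.set_ef(1000)  # ef should be at least k for good recall
#   labels, distances = p.knn_query(query_vecs, k=10)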

# Core mapping script (filename assumed; imports reconstructed from usage below;
# topk_mean() and dropout() are helpers assumed to be defined elsewhere in this file).
import argparse
import collections
import re
import sys
import time

import numpy as np

import embeddings


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    np.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = np.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = np.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, np.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = np.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = np.arange(sim_size)
        elif args.direction == 'union':
            src_indices = np.concatenate((np.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = np.concatenate((sim.argmax(axis=1), np.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr) # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') # Allocate memory xw = np.empty_like(x) zw = np.empty_like(z) src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff) simfwd = np.empty((args.batch_size, trg_size), dtype=dtype) simbwd = np.empty((args.batch_size, src_size), dtype=dtype) if args.validation is not None: simval = np.empty((len(validation.keys()), z.shape[0]), dtype=dtype) best_sim_forward = np.full(src_size, -100, dtype=dtype) src_indices_forward = np.arange(src_size) trg_indices_forward = np.zeros(src_size, dtype=int) best_sim_backward = np.full(trg_size, -100, dtype=dtype) src_indices_backward = np.zeros(trg_size, dtype=int) trg_indices_backward = np.arange(trg_size) knn_sim_fwd = np.zeros(src_size, dtype=dtype) knn_sim_bwd = np.zeros(trg_size, dtype=dtype) # Training loop best_objective = objective = -100. 
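# The self-learning loop below calls dropout(...) during stochastic dictionary
# induction. That helper is defined elsewhere in this codebase; a plausible
# minimal sketch, assuming it zeroes entries with probability p so that the
# subsequent argmax can escape the current nearest neighbor:
import numpy as np

def dropout_sketch(m, p):
    # keep each entry with probability 1 - p; dropped entries cannot win the argmax
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask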
it = 1 last_improvement = 0 keep_prob = args.stochastic_initial t = time.time() end = not args.self_learning while True: # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: if keep_prob >= 1.0: end = True keep_prob = min(1.0, args.stochastic_multiplier*keep_prob) last_improvement = it # Update the embedding mapping if args.orthogonal or not end: # orthogonal mapping u, s, vt = np.linalg.svd(z[trg_indices].T.dot(x[src_indices])) w = vt.T.dot(u.T) x.dot(w, out=xw) zw[:] = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = np.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) x.dot(w, out=xw) zw[:] = z else: # advanced mapping # TODO xw.dot(wx2, out=xw) and the like not working xw[:] = x zw[:] = z # STEP 1: Whitening def whitening_transformation(m): u, s, vt = np.linalg.svd(m, full_matrices=False) return vt.T.dot(np.diag(1/s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = np.linalg.svd(xw[src_indices].T.dot(zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if end: break else: # Update the training dictionary if args.direction in ('forward', 'union'): if args.csls_neighborhood > 0: for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j]) simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) if args.direction in ('backward', 'union'): if args.csls_neighborhood > 0: for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j]) simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j]) if args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union': src_indices = np.concatenate((src_indices_forward, src_indices_backward)) trg_indices = np.concatenate((trg_indices_forward,
trg_indices_backward)) # Objective function evaluation if args.direction == 'forward': objective = np.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = np.mean(best_sim_backward).tolist() elif args.direction == 'union': objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)).tolist() / 2 if objective - best_objective >= args.threshold: last_improvement = it best_objective = objective # Accuracy and similarity evaluation in validation if args.validation is not None: src = list(validation.keys()) xw[src].dot(zw.T, out=simval) nn = asnumpy(simval.argmax(axis=1)) accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))]) similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close()
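# topk_mean(...) above computes, per row, the mean of the k largest
# similarities (the CSLS neighborhood term). Its definition lives elsewhere
# in this codebase; a simple, unoptimized equivalent would be:
import numpy as np

def topk_mean_sketch(m, k):
    # sort each row ascending, then average the k rightmost (largest) entries
    return np.sort(m, axis=1)[:, -k:].mean(axis=1)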
# Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' ''' supervised approach parameters ''' init_dictionary=args.supervised normalize=['unit', 'center', 'unit'] whiten=True src_reweight=0.5 trg_reweight=0.5 src_dewhiten='src' trg_dewhiten='trg' batch_size=1000 # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) # Build the seed dictionary src_indices = [] trg_indices = [] f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
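# Given seed pairs, the mapping step shared by these scripts solves the
# orthogonal Procrustes problem: the W minimizing ||XW - Z||_F over
# orthogonal matrices has a closed form via SVD, exactly as in the training
# loops above and below (u, s, vt = svd(Z^T X); W = V U^T). A minimal
# NumPy sketch:
import numpy as np

def procrustes_sketch(x_seed, z_seed):
    # x_seed, z_seed: row-aligned embeddings for the dictionary pairs
    u, s, vt = np.linalg.svd(z_seed.T.dot(x_seed))
    return vt.T.dot(u.T)  # apply as x.dot(w) to map source into target space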
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('sense_input', help='the input sense mapping matrix') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('tsns_output', default='tsns.pkl', help='the output target senses pickle file') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--future', action='store_true', help='experiment with stuff') recommended_type.add_argument('--toy', action='store_true', help='experiment with stuff on toy dataset') recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') 
self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', default='map.log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments') future_group.add_argument('--skip_top', type=int, default=0, help='Top k words to skip, presumably function words') future_group.add_argument( '--start_src', action='store_true', help='Algorithm starts by tuning sense embeddings based on source') future_group.add_argument('--trim_senses', action='store_true', help='Trim sense table to working vocab') future_group.add_argument( '--lamb', type=float, default=0.5, help='Weight hyperparameter for sense alignment objectives') future_group.add_argument('--reglamb', type=float, default=1., help='Lasso regularization hyperparameter') future_group.add_argument( '--ccreglamb', type=float, default=0.1, help='Sense embedding regularization hyperparameter') future_group.add_argument('--inv_delta', type=float, default=0.0001, help='Delta_I added for inverting sense matrix') future_group.add_argument('--lasso_iters', type=int, default=10, help='Number of iterations for LASSO/NMF') future_group.add_argument('--iterations', type=int, default=-1, help='Number of overall model iterations') future_group.add_argument('--trg_batch', type=int, default=5000, help='Batch size for target steps') future_group.add_argument( '--trg_knn', action='store_true', help='Perform target sense mapping by k-nearest neighbors') future_group.add_argument( '--trg_sns_csls', type=int, default=10, help='K-nearest neighbors for CSLS target sense search') future_group.add_argument( '--senses_per_trg', type=int, default=1, help='K-max target sense mapping (default = 1 = off)') future_group.add_argument( '--gd', action='store_true', help='Apply gradient descent for assignment and synset embeddings') future_group.add_argument('--gd_lr', type=float, default=1e-2, help='Learning rate for SGD (default=0.01)') future_group.add_argument('--gd_wd', action='store_true', help='Weight decay in SGD') future_group.add_argument( '--gd_wd_hl', type=int, default=100, help='Weight decay half-life in SGD, default=100') future_group.add_argument( '--gd_clip', type=float, default=5., help='Per-coordinate gradient clipping (default=5)') future_group.add_argument( '--gd_map_steps', type=int, default=1, help='Consecutive steps for each target-sense mapping update phase') future_group.add_argument( '--gd_emb_steps', type=int, default=1, help='Consecutive steps for each sense embedding update phase') future_group.add_argument( '--base_prox_lambda', type=float, default=0.99, help='Lambda for proximal gradient in lasso step') future_group.add_argument( '--prox_decay', action='store_true', help='Multiply proximal lambda by itself each iteration') future_group.add_argument(
'--sense_limit', type=float, default=1.1, help= 'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)' ) future_group.add_argument( '--gold_pairs', help='Gold data for evaluation, if exists (not for tuning)') future_group.add_argument( '--gold_threshold', type=float, default=0.0, help='Threshold for gold mapping (0 is fine if sparse)') future_group.add_argument('--debug', action='store_true') args = parser.parse_args() # pre-setting groups if args.toy: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=50, trim_senses=True, inv_delta=1., reglamb=0.2, lasso_iters=100, gd_wd=True, log='map-toy.log') if args.unsupervised or args.future: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=2000, trim_senses=True, gd_wd=True) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=20000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' # many operations not supported by cupy elif args.precision == 'fp32': # default dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings print('reading embeddings...') srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) print('embeddings read') # Read input source sense mapping print('reading sense mapping') src_senses = pickle.load(open(args.sense_input, 'rb')) if src_senses.shape[0] != x.shape[0]: src_senses = csr_matrix(src_senses.transpose() ) # using non-cuda scipy because of 'inv' impl #src_senses = get_sparse_module(src_senses) print( f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros' ) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) print('CUDA loaded') else: xp = np xp.random.seed(args.seed) # removed word to index map (only relevant in supervised learning or with validation) # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) print('normalization complete') # removed building the seed dictionary # removed validation step # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') print(f'logging into {args.log}') # Allocate memory # Initialize the projection matrices W(s) = W(t) = I. 
xw = xp.empty_like(x) zw = xp.empty_like(z) xw[:] = x zw[:] = z src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0] - args.skip_top, args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0] - args.skip_top, args.vocabulary_cutoff) emb_dim = x.shape[1] cutoff_end = min(src_size + args.skip_top, x.shape[0]) if args.trim_senses: # reshape sense assignment src_senses = src_senses[args.skip_top:cutoff_end] # new columns for words with no senses in original input ### TODO might also need this if not trimming (probably kinda far away) newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\ if src_senses.getrow(i).getnnz() == 0] #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file: # dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0] # pickle.dump(np.array(dummy_col_idcs), dummy_cols_file) # trim senses no longer used, add new ones colsums = src_senses.sum(axis=0).tolist()[0] kept_senses = [i for i, j in enumerate(colsums) if j > 0] #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file: # pickle.dump(np.array(kept_senses), kept_save_file) src_senses = hstack([src_senses[:, kept_senses]] + newcols) print( f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros' ) sense_size = src_senses.shape[1] if args.gold_pairs is not None: with open(args.gold_pairs, 'rb') as gold_pairs_f: gold_pairs = pickle.load(gold_pairs_f) gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \ if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]] gold_trgs = sorted(set([x[0] for x in gold_pairs])) gold_senses = sorted(set([x[1] for x in gold_pairs])) gold_domain_size = len(gold_trgs) * len(gold_senses) print( f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses' ) # Initialize the concept embeddings from the source embeddings ### TODO maybe try gradient descent instead? ### TODO (pre-)create non-singular alignment matrix cc = xp.empty((sense_size, emb_dim), dtype=dtype) # \tilde{E} t01 = time.time() print('starting psinv calc') src_sns_psinv = psinv(src_senses, dtype, args.inv_delta) xecc = x[args.skip_top:cutoff_end].T.dot( get_sparse_module(src_senses).toarray()).T # sense_size * emb_dim cc[:] = src_sns_psinv.dot(xecc) print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds', file=sys.stderr) if args.verbose: # report precision of psedo-inverse operation, checked by inverting pseudo_id = src_senses.transpose().dot(src_senses).dot( src_sns_psinv.get()) real_id = sparse_id(sense_size) rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size) print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}') ### TODO initialize trg_senses using seed dictionary instead? 
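# psinv(...) above is defined elsewhere in this codebase. From its use
# (cc = psinv.dot(S^T E)) and the verbose check that S^T S times psinv is
# close to the identity, it plausibly computes the ridge-regularized inverse
# (S^T S + delta*I)^{-1}, with inv_delta keeping the Gram matrix invertible.
# A scipy-based sketch under that assumption (the real one apparently
# returns a matrix on the active NumPy/CuPy backend):
import scipy.sparse as sp
from scipy.sparse.linalg import inv as sparse_inv

def psinv_sketch(senses, dtype, inv_delta):
    # senses: (n_words x n_senses) sparse matrix S; returns (S^T S + delta*I)^{-1}
    k = senses.shape[1]
    gram = (senses.T.dot(senses) + inv_delta * sp.identity(k, dtype=dtype)).tocsc()
    return sparse_inv(gram)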
trg_sns_size = trg_size if args.trim_senses else z.shape[0] trg_senses = csr_matrix( (trg_sns_size, sense_size)) # using non-cuda scipy because of 'inv' impl zecc = xp.empty_like(xecc) # sense_size * emb_dim #tg_grad = xp.empty((trg_sns_size, sense_size)) if args.gd: # everything can be done on gpu src_senses = get_sparse_module(src_senses, dtype=dtype) trg_senses = get_sparse_module(trg_senses, dtype=dtype) if args.sense_limit > 0.0: trg_sense_limit = int(args.sense_limit * src_senses.getnnz()) if args.verbose: print( f'limiting target side to {trg_sense_limit} sense mappings' ) else: trg_sense_limit = -1 ### TODO return memory assignment for similarities? # Training loop if args.gd: prox_lambda = args.base_prox_lambda else: lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\ positive=True, warm_start=True) # TODO more parametrization if args.log is not None: if args.gd: print(f'gradient descent lr: {args.gd_lr}', file=log) print(f'base proximal lambda: {args.base_prox_lambda}', file=log) else: print(f'lasso regularization: {args.reglamb}', file=log) print(f'lasso iterations: {args.lasso_iters}', file=log) print(f'inversion epsilon: {args.inv_delta}', file=log) if args.gold_pairs is not None: print(f'gold mappings: {len(gold_pairs)}', file=log) print( f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings', file=log) log.flush() best_objective = objective = 1000000000. correct_mappings = -1 regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb it = 1 last_improvement = 0 t = time.time() map_gd_lr = args.gd_lr emb_gd_lr = args.gd_lr end = False print('starting training') if args.start_src: print('starting with converging synset embeddings') it_range = range( args.iterations ) ### TODO possibly add arg, but there's early stopping if not args.verbose: it_range = tqdm(it_range) prev_obj = float('inf') for pre_it in it_range: if args.gd_wd: emb_gd_lr = args.gd_lr * pow(0.5, floor( pre_it / args.gd_wd_hl)) # Synset embedding cc_grad = src_senses.T.dot( xw[args.skip_top:cutoff_end] - src_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad # Source projection u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) pre_objective = ((xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 pre_objective = float(pre_objective) if args.verbose and pre_it > 0 and pre_it % 10 == 0: print( f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}' ) if pre_objective > prev_obj: print( f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}' ) # revert cc -= emb_gd_lr * cc_grad break prev_obj = pre_objective while True: if it % 50 == 0: print( f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}' ) # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: last_improvement = it if args.iterations > 0 and it > args.iterations: end = True ### update target assignments (6) - lasso-esque regression time6 = time.time() # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1)) if args.trg_knn: # for csls-based neighborhoods knn_sense = xp.full(sense_size, -100) for i in range(0, sense_size, args.trg_batch): batch_end = min(i + args.trg_batch, 
sense_size) sim_sense_trg = cc[i:batch_end].dot( zw[args.skip_top:cutoff_end].T) knn_sense[i:batch_end] = topk_mean(sim_sense_trg, k=args.trg_sns_csls, inplace=True) # calculate new target mappings trg_senses = lil_matrix(trg_senses.shape) for i in range(0, trg_size, args.trg_batch): sns_batch_end = min(i + args.trg_batch, trg_size) z_i = i + args.skip_top z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0]) sims = zw[z_i:z_batch_end].dot(cc.T) sims -= knn_sense / 2 # equivalent to the real CSLS scores for NN best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() # second-to-lth-best for l in range(args.senses_per_trg - 1): sims[(list(range(sims.shape[0])), best_idcs)] = 0. best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() trg_senses = get_sparse_module(trg_senses.tocsr()) elif args.gd: ### TODO add args.skip_top calculations if args.gd_wd: true_it = (it - 1) * args.gd_map_steps map_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'mapping learning rate: {map_gd_lr}') for k in range(args.gd_map_steps): # st <- st + eta * (ew - st.dot(es)).dot(es.T) # allow up to sense_limit updates, clip gradient batch_grads = [] for i in range(0, trg_size, args.trg_batch): batch_end = min(i + args.trg_batch, trg_size) tg_grad_b = (zw[i:batch_end] - trg_senses[i:batch_end].dot(cc)).dot(cc.T) # proximal gradient tg_grad_b += prox_lambda tg_grad_b.clip(None, 0.0, out=tg_grad_b) batch_grads.append(batch_sparse(tg_grad_b)) tg_grad = get_sparse_module(vstack(batch_grads)) del tg_grad_b if args.prox_decay: prox_lambda *= args.base_prox_lambda ### TODO consider weight decay here as well (args.gd_wd) trg_senses -= map_gd_lr * tg_grad # allow up to sense_limit nonzeros if trg_sense_limit > 0: trg_senses = trim_sparse(trg_senses, trg_sense_limit, clip=None) ### TODO consider finishing up with lasso (maybe only in final iteration) else: ### TODO add args.skip_top calculations # parallel LASSO (no cuda impl) cccpu = cc.get().T # emb_dim * sense_size lasso_model.fit(cccpu, zw[:trg_size].get().T) ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it) trg_senses = lasso_model.sparse_coef_ if args.verbose: print( f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') # Write target sense mapping with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl', mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile) ### update synset embeddings (10) time10 = time.time() if args.gd and args.gd_emb_steps > 0: ### TODO probably handle sizes and/or threshold sparse matrix if args.gd_wd: true_it = (it - 1) * args.gd_emb_steps emb_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'embedding learning rate: {emb_gd_lr}') ### replace block for no-source-tuning mode all_senses = trg_senses if args.start_src else get_sparse_module( vstack((src_senses.get(), trg_senses.get()), format='csr'), dtype=dtype) aw = zw[args. 
skip_top:cutoff_end] if args.start_src else xp.concatenate( (xw[args.skip_top:cutoff_end], zw[args.skip_top:cutoff_end])) for i in range(args.gd_emb_steps): cc_grad = all_senses.T.dot( aw - all_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad else: ### TODO add args.skip_top calculations all_senses = get_sparse_module( vstack((src_senses, trg_senses), format='csr')) xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\ .dot(all_senses.toarray()).T # sense_size * emb_dim all_sns_psinv = psinv( all_senses.get(), dtype, args.inv_delta ) ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same] cc[:] = all_sns_psinv.dot(xzecc) if args.verbose: print(f'synset embedding update: {time.time()-time10:.2f}', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') ### update projections (3,5) # write to zw and xw if args.orthogonal or not end: ### remove block for no-source-tuning mode # source side - mappings don't change so xecc is constant #if not args.start_src: # need to do this anyway whenever cc updates time3 = time.time() u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) if args.verbose: print(f'source projection update: {time.time()-time3:.2f}', file=sys.stderr) # target side - compute sense mapping first time3 = time.time() zecc.fill(0.) for i in range(0, trg_size, args.trg_batch): end_idx = min(i + args.trg_batch, trg_size) zecc += z[i:end_idx].T.dot( get_sparse_module(trg_senses[i:end_idx]).toarray()).T u, s, vt = xp.linalg.svd(cc.T.dot(zecc)) wz = vt.T.dot(u.T).astype(dtype) z.dot(wz, out=zw) if args.verbose: print(f'target projection update: {time.time()-time3:.2f}', file=sys.stderr) ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc. 
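# The proximal-gradient lasso step above (add prox_lambda to the gradient,
# clip at zero, then step) is a variant of the textbook proximal operator
# for lambda * ||S||_1 under a nonnegativity constraint. A minimal sketch of
# that standard operator, for comparison:
import numpy as np

def prox_l1_nonneg_sketch(s, lam):
    # soft-threshold every coordinate toward zero, then project onto s >= 0
    return np.maximum(s - lam, 0.0)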
# Objective function evaluation time_obj = time.time() trg_senses_l1 = float(trg_senses.sum()) src_obj = (float( xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 trg_obj = (float( xp.linalg.norm( zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2 objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1 # TODO consider thresholding reg part if args.verbose: print(f'objective calculation: {time.time()-time_obj:.2f}', file=sys.stderr) if objective - best_objective <= -args.threshold: last_improvement = it best_objective = objective # WordNet transduction evaluation (can't tune on this) if args.gold_pairs is not None: np_trg_senses = trg_senses.get() trg_corr = [ p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold ] correct_mappings = len(trg_corr) domain_trgs = np_trg_senses[gold_trgs][:, gold_senses] else: correct_mappings = -1 # Logging duration = time.time() - t if args.verbose: print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('objective: {0:.3f}'.format(objective), file=sys.stderr) print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1), file=sys.stderr) if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0: print( f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision', file=sys.stderr) print(file=sys.stderr) sys.stderr.flush() if args.log is not None: print( f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}', file=log) log.flush() if end: break t = time.time() it += 1 # Write mapped embeddings with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile: embeddings.write(src_words, xw, srcfile) with open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile: embeddings.write(trg_words, zw, trgfile) # Write target sense mapping with open(args.tsns_output, mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile)
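# trim_sparse(...) in the training loop above is defined elsewhere; given
# the 'allow up to sense_limit nonzeros' comment, it plausibly keeps only
# the n largest-magnitude entries of the sparse assignment matrix. A sketch
# under that assumption, for a scipy sparse input:
import numpy as np

def trim_sparse_sketch(m, n_keep):
    # drop everything below the n_keep-th largest |value|, then compact storage
    if m.getnnz() <= n_keep:
        return m
    m = m.tocsr().copy()
    cutoff = np.sort(np.abs(m.data))[-n_keep]
    m.data[np.abs(m.data) < cutoff] = 0.0
    m.eliminate_zeros()
    return m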
def main(): # Parse command line arguments # https://docs.python.org/3/library/argparse.html parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') # description - This argument gives a brief description of what the program does and how it works. parser.add_argument('src_input', help='the input source embeddings') # help - A brief description of what the argument does. parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') # the '--' prefix makes an argument optional # default - The value produced if the argument is absent from the command line. parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') # choices - A container of the allowable values for the argument. parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') # action - The basic type of action to be taken when this argument is encountered at the command line. # store_true - stores the value True when the flag is present parser.add_argument( '--batch_size', default=1000, type=int, help= 'batch size (defaults to 1000); does not affect results, larger is usually faster but uses more memory' ) parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') parser.add_argument('--draw', action='store_true', help='use seaborn to draw') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') # add_argument_group() - returns an argument group object which has an add_argument() method just like a regular ArgumentParser.
# it's a better conceptual grouping of arguments than this default one recommended_type = recommended_group.add_mutually_exclusive_group() # argparse will make sure that only one of the arguments in the mutually exclusive group is present on the command line recommended_type.add_argument( '--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') recommended_type.add_argument( '--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary') recommended_type.add_argument( '--identical', action='store_true', help= 'recommended if you have no seed dictionary but can rely on identical words' ) recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system') # metavar - A name for the argument in usage messages recommended_type.add_argument( '--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization') recommended_type.add_argument( '--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary') recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument( '-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') init_type.add_argument( '--init_numerals', action='store_true', help= 'use latin numerals (i.e.
words matching [0-9]+) as the seed dictionary' ) init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') # no normalization in default mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument( '--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') args = parser.parse_args() if args.supervised is not None: parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', 
trg_dewhiten='trg', batch_size=1000) if args.semi_supervised is not None: parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.identical: parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.aaai2018: parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) if args.acl2017: parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.acl2017_seed: parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.emnlp2016: parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) else: xp = np # fix random seed xp.random.seed(args.seed) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) # Build the seed dictionary src_indices = [] trg_indices = [] dict_size = 5000 if args.init_unsupervised: sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min( x.shape[0], z.shape[0], args.unsupervised_vocab) u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False) xsim = (u * s).dot(u.T) u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False) zsim = (u * s).dot(u.T) del u, s, vt xsim.sort(axis=1) zsim.sort(axis=1) embeddings.normalize(xsim, args.normalize) embeddings.normalize(zsim, args.normalize) sim = xsim.dot(zsim.T) if args.csls_neighborhood > 0: knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) knn_sim_bwd = topk_mean(sim.T, 
k=args.csls_neighborhood) sim -= knn_sim_fwd[:, xp.newaxis] / 2 + knn_sim_bwd / 2 if args.direction == 'forward': src_indices = xp.arange(sim_size) trg_indices = sim.argmax(axis=1) elif args.direction == 'backward': src_indices = sim.argmax(axis=0) trg_indices = xp.arange(sim_size) elif args.direction == 'union': src_indices = xp.concatenate( (xp.arange(sim_size), sim.argmax(axis=0))) trg_indices = xp.concatenate( (sim.argmax(axis=1), xp.arange(sim_size))) del xsim, zsim, sim elif args.init_numerals: numeral_regex = re.compile('^[0-9]+$') # ^ match from start of words $ match to end of words # consider numbers from 0 to 9 # http://www.runoob.com/python/python-reg-expressions.html src_numerals = { word for word in src_words if numeral_regex.match(word) is not None } trg_numerals = { word for word in trg_words if numeral_regex.match(word) is not None } numerals = src_numerals.intersection(trg_numerals) for word in numerals: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) elif args.init_identical: identical = set(src_words).intersection(set(trg_words)) for word in identical: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) else: f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) if len(src_indices) == dict_size: break # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') # Allocate memory xw = xp.empty_like(x) zw = xp.empty_like(z) # choose to cut-off or not src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0], args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0], args.vocabulary_cutoff) simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype) simbwd = xp.empty((args.batch_size, src_size), dtype=dtype) if args.validation is not None: simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype) best_sim_forward = xp.full(src_size, -100, dtype=dtype) src_indices_forward = xp.arange(src_size) trg_indices_forward = xp.zeros(src_size, dtype=int) best_sim_backward = xp.full(trg_size, -100, dtype=dtype) src_indices_backward = xp.zeros(trg_size, dtype=int) trg_indices_backward = xp.arange(trg_size) knn_sim_fwd = xp.zeros(src_size, dtype=dtype) knn_sim_bwd = xp.zeros(trg_size, dtype=dtype) # Training loop best_objective = objective = -100. 
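# Both the unsupervised initialization above and the induction loop below
# use CSLS-style hubness correction: each similarity is penalized by half
# the mean top-k similarity of its row and of its column (the loop computes
# the same correction incrementally, batch by batch). As one dense expression:
import numpy as np

def csls_sketch(sim, k=10):
    # penalize hub words that are near-neighbors of everything
    knn_fwd = np.sort(sim, axis=1)[:, -k:].mean(axis=1)
    knn_bwd = np.sort(sim.T, axis=1)[:, -k:].mean(axis=1)
    return sim - knn_fwd[:, np.newaxis] / 2 - knn_bwd[np.newaxis, :] / 2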
it = 1 last_improvement = 0 keep_prob = args.stochastic_initial t = time.time() end = not args.self_learning while True: # Increase the keep probability if we have not improved in args.stochastic_interval iterations # for init_numerals: if the objective doesn't increase after 1 iteration, stop directly if it - last_improvement > args.stochastic_interval: if keep_prob >= 1.0: end = True keep_prob = min(1.0, args.stochastic_multiplier * keep_prob) last_improvement = it # Update the embedding mapping if args.orthogonal or not end: # orthogonal mapping u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) w = vt.T.dot(u.T) x.dot(w, out=xw) zw[:] = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot( x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) x.dot(w, out=xw) zw[:] = z else: # advanced mapping # TODO xw.dot(wx2, out=xw) and the like not working xw[:] = x zw[:] = z # STEP 1: Whitening def whitening_transformation(m): u, s, vt = xp.linalg.svd(m, full_matrices=False) return vt.T.dot(xp.diag(1 / s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot( zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if end: break else: # Update the training dictionary if args.direction in ('forward', 'union'): if args.csls_neighborhood > 0: for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i]) knn_sim_bwd[i:j] = topk_mean(simbwd[:j - i], k=args.csls_neighborhood, inplace=True) for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i]) simfwd[:j - i].max(axis=1, out=best_sim_forward[i:j]) simfwd[:j - i] -= knn_sim_bwd / 2 # Equivalent to the real CSLS scores for NN dropout(simfwd[:j - i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) if args.direction in ('backward', 'union'): if args.csls_neighborhood > 0: for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i]) knn_sim_fwd[i:j] = topk_mean(simfwd[:j - i], k=args.csls_neighborhood, inplace=True) for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i]) simbwd[:j - i].max(axis=1, out=best_sim_backward[i:j]) simbwd[:j - i] -= knn_sim_fwd / 2 # Equivalent to the real CSLS scores for NN dropout(simbwd[:j - i], 1 - keep_prob).argmax( axis=1, out=src_indices_backward[i:j]) if args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union':
src_indices = xp.concatenate( (src_indices_forward, src_indices_backward)) trg_indices = xp.concatenate( (trg_indices_forward, trg_indices_backward)) # Objective function evaluation if args.direction == 'forward': objective = xp.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = xp.mean(best_sim_backward).tolist() elif args.direction == 'union': objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 if objective - best_objective >= args.threshold: last_improvement = it best_objective = objective # Accuracy and similarity evaluation in validation if args.validation is not None: src = list(validation.keys()) xw[src].dot(zw.T, out=simval) nn = asnumpy(simval.argmax(axis=1)) accuracy = np.mean([ 1 if nn[i] in validation[src[i]] else 0 for i in range(len(src)) ]) similarity = np.mean([ max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src)) ]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) print( '\t- Drop probability: {0:9.4f}%'.format(100 - 100 * keep_prob), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format( 100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format( it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # draw distribution of language space if args.draw: PCA_model = PCA(n_components=2) x_PCA = PCA_model.fit_transform(asnumpy(xw)) x1 = [feature[0] for feature in x_PCA] y1 = [feature[1] for feature in x_PCA] z_PCA = PCA_model.fit_transform(asnumpy(zw)) x2 = [feature[0] for feature in z_PCA] y2 = [feature[1] for feature in z_PCA] ''' # draw with plt plt.scatter(x2, y2, s=10, c='r', alpha=0.4) plt.scatter(x1, y1, s=10, c='b', alpha=0.2) plt.savefig('./share_space.png') ''' # draw with seaborn plt.figure() sns.jointplot(x1, y1, kind='hex', color='b') plt.savefig('./src_mapped_emb.png') plt.figure() sns.jointplot(x2, y2, kind='hex', color='g') plt.savefig('./trg_mapped_emb.png') # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close()
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument( '--batch_size', default=10000, type=int, help= 'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory' ) parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') parser.add_argument('--maxiter', type=int, default=10, help='max number of iterations') parser.add_argument('--corekbest', type=int, default=2, help='nn ranking to be considered as a match') parser.add_argument('--decayrate', type=float, default=1.01, help='for boosting') parser.add_argument('--init_vocab', type=int, default=10000, help='for boosting') parser.add_argument('--dictname', default='dict.tmp', help='output the dictionary') recommended_type = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') recommended_type.add_argument( '--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') recommended_type.add_argument( '--identical', default=True, help= 'recommended if you have no seed dictionary but can rely on identical words' ) init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument( '-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') init_type.add_argument( '--init_numerals', action='store_true', help= 'use latin numerals (i.e. 
words matching [0-9]+) as the seed dictionary' ) init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--vocabulary', help='restrict source vocab') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') self_learning_group.add_argument( '--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') self_learning_group.add_argument( '--log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') args = parser.parse_args() parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit']) args = parser.parse_args() print(args, file=sys.stderr) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' os.makedirs(OUTPUTDIR, exist_ok=True) # Read input embeddings vocabulary = None if args.vocabulary is not None: vocabulary = set() with open(args.vocabulary, encoding=args.encoding, errors='surrogateescape') as file: for l in file: vocabulary.add(l.split()[0]) print(f'vocab size:\t{len(vocabulary)}') with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \ open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile: src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=args.vocabulary_cutoff, vocabulary=vocabulary) trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=args.vocabulary_cutoff) embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) else: xp = np xp.random.seed(args.seed) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build the seed dictionary src_indices = [] trg_indices = [] if args.supervised: f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: try: src, trg = line.split()[:2] except ValueError: continue try: src_ind = src_word2ind[src] trg_ind 
    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                try:
                    src, trg = line.split()
                except ValueError:
                    continue
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        # If one of the translation options is in the vocabulary, the entry is not an OOV
        oov -= vocab
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    matches = collections.Counter()
    decided = collections.Counter()
    cum_weights = collections.Counter(matches)
    score = collections.Counter()
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    identical = set(src_words).intersection(set(trg_words))
    for word in identical:
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1
    if args.validation is not None:
        simval = xp.empty((len(validation), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple(
        'MatchStats', ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    stats = None
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        embeddings.noise(x)
        embeddings.noise(z)
        # NumPy ufuncs and linalg routines dispatch to CuPy arrays through the
        # array protocols, so the np.* calls below also cover the --cuda path.
        if args.unconstrained:
            # Weighted least-squares mapping
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices],
                                np.sqrt(weights) * z[trg_indices],
                                rcond=None)[0]
            x.dot(w, out=xw)
            zw = z[:]
        else:
            # Weighted orthogonal (Procrustes) mapping via SVD
            u, s, vt = xp.linalg.svd((weights * z[trg_indices]).T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw = z[:]
        w_dot = np.sum(weights * z[trg_indices] * xw[src_indices]) / weights.sum()
        mean_dot = np.sum(z[trg_indices] * xw[src_indices]) / len(src_indices)
        delta_w = np.linalg.norm(w - wprev)
        stats = Stats(w_dot=w_dot, mean_dot=mean_dot, delta_w=delta_w,
                      current_vocab=current_vocab, len_match=len(src_indices))
        # Grow the working vocabulary by 10% whenever the weighted objective drops
        if it > 1 and stats.w_dot < pstats.w_dot:
            current_vocab = min(int(current_vocab * 1.1), args.vocabulary_cutoff)
        # Temperature annealing: T decays geometrically from 1 towards 1e-2 at it == maxiter
        T = np.exp((it - 1) * np.log(1e-2) / args.maxiter)
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw, zw, cum_weights, score,
                                          ul=current_vocab, T=T,
                                          kbest=args.corekbest,
                                          csls=args.csls_neighborhood,
                                          decay=args.decayrate)
        # Decay previously decided pairs, then blend in the new match scores
        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)
        for m in score:
            # NOTE: the original condition here was `if m in score:`, which is
            # always true inside this loop; `m in decided` looks like the intent
            # (slow updates for known pairs, faster adoption of new ones).
            if m in decided:
                eta = 1 / it
            else:
                eta = max(0.5, 1 / it)
            decided[m] = decided[m] * (1 - eta) + score[m] * eta

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0
                                for i in range(len(src))])
            similarity = np.mean([np.max([simval[i, j].tolist() for j in validation[src[i]]])
                                  for i in range(len(src))])
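        # Dump the current soft dictionary for this iteration, one
        # "<src_word>\t<trg_word>\t<score>" line per candidate pair, sorted by
        # decreasing score (written to OUTPUTDIR/<dictname>.<iteration>).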
        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t
        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
            print('\t- Objective:       {0:9.4f}%'.format(100 * objective), file=sys.stderr)
            print(f'\t- #match/#decided: {len(src_indices)}/{len(decided)}', file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                print('\t- Val. accuracy:   {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                print('\t- Val. coverage:   {0:9.4f}%'.format(100 * validation_coverage),
                      file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                100 * similarity, 100 * accuracy,
                100 * validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration),
                  file=log)
            log.flush()

        if it >= args.maxiter:
            break
        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    # Write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        # asnumpy brings w back to the CPU when running with --cuda; the freshly
        # read x is a NumPy array and cannot be multiplied by a GPU-resident w.
        embeddings.write(src_words, x.dot(asnumpy(w)), srcfile)
        embeddings.write(trg_words, z, trgfile)
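# A hypothetical invocation of this script (the script and embedding file
# names below are placeholders, not part of the source):
#
#     python3 map_embeddings.py --unsupervised --cuda -v \
#         src.emb.txt trg.emb.txt src.mapped.emb.txt trg.mapped.emb.txt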