# Common imports used by the snippets in this file; project-specific helpers
# (topk_mean, dropout, lat_var, the cupy utilities, read_model, apply_mapping,
# constants such as MAX_DIM_X/MAX_DIM_Z, and the PyTorch net used in the
# training snippet) are defined or imported elsewhere.
import argparse
import collections
import pickle
import re
import sys
import time

import numpy as np

import embeddings


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Project the source embeddings into the target embedding space minimizing the squared Euclidean distances for the given dictionary')
    parser.add_argument('src_embeddings', help='the source embeddings')
    parser.add_argument('trg_embeddings', help='the target embeddings')
    parser.add_argument('-c', '--orthogonal', dest='orthogonal', action='store_true', help='use orthogonal constrained mapping (default)')
    parser.add_argument('-u', '--unconstrained', dest='orthogonal', action='store_false', help='use unconstrained mapping')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output projected embedding file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.set_defaults(orthogonal=True)
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src_indices = []
    trg_indices = []
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Learn the linear transformation minimizing the squared Euclidean distances (see paper)
    x = src_matrix[src_indices]
    z = trg_matrix[trg_indices]
    if args.orthogonal:  # orthogonal mapping
        u, s, vt = np.linalg.svd(np.dot(z.T, x))
        w = np.dot(vt.T, u.T)
    else:  # unconstrained mapping
        x_pseudoinv = np.dot(np.linalg.inv(np.dot(x.T, x)), x.T)
        w = np.dot(x_pseudoinv, z)

    # Project and write source embeddings
    f = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, np.dot(src_matrix, w), f)
    a = embeddings.see_mapping(w)
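# A hedged illustration of the orthogonal branch above: for dictionary matrices X (source
# rows) and Z (target rows), W = V.dot(U.T) with U, S, Vt = SVD(Z.T.dot(X)) is the
# closed-form orthogonal Procrustes solution that minimizes ||XW - Z||_F^2. The helper
# below is a standalone sketch (the name `orthogonal_procrustes` is ours, not part of
# the original code).
def orthogonal_procrustes(x, z):
    # x, z: (n, d) arrays holding the source and target vectors of the dictionary pairs
    u, s, vt = np.linalg.svd(np.dot(z.T, x))
    w = np.dot(vt.T, u.T)  # orthogonal by construction: w.dot(w.T) is (numerically) the identity
    return w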
def map_embedding_db(in_emb_fname, out_emb_fname, vocab_type, mapping_model_dir, latent_space=True):
    """
    Maps all the vocabulary in `in_emb_fname` to the target language space using the model
    in `mapping_model_dir`. The resultant embeddings are stored in `out_emb_fname`.

    vocab_type: one of `src` or `tgt`; indicates the source or target language as per the trained model.
    latent_space: if True, the embeddings are mapped to the latent space; otherwise, they are
        mapped to the embedding space of the other language.
    """
    print('Loading train data...')

    # Read input embeddings
    with open(in_emb_fname, 'r', encoding='utf-8', errors='surrogateescape') as srcfile:
        src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
        src_word2ind = {word: i for i, word in enumerate(src_words)}

    model_params = read_model(mapping_model_dir)
    xw = apply_mapping(x, vocab_type, model_params, latent_space)

    with open(out_emb_fname, 'w', encoding='utf-8') as outfile:
        embeddings.write(src_words, xw, outfile)
def deal_my_with_position_File(file_content, fw_path, fEmPosition):
    # Read the embeddings whose words should be renamed
    srcfile = open(fw_path, encoding='utf-8', errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)

    # Read the mapping file (tab-separated: new_word \t old_word)
    map_my_position = {}
    fr_file = open(file_content, 'r', encoding='utf-8')
    for singleWord in fr_file.readlines():
        singleWord = singleWord.strip().split('\t')
        map_my_position[singleWord[1]] = singleWord[0]

    # Replace every embedding word by its mapped value
    for line in range(len(src_words)):
        src_words[line] = map_my_position[src_words[line]]

    targetfile = open(fEmPosition, mode='w', encoding='utf-8', errors='surrogateescape')
    embeddings.write(src_words, src_matrix, targetfile)
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument('actions', choices=['none', 'unit', 'center', 'unitdim', 'centeremb'], nargs='+', help='the actions to perform in order')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the input word embedding file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output word embedding file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions
    for action in args.actions:
        if action == 'unit':
            matrix = embeddings.length_normalize(matrix)
        elif action == 'center':
            matrix = embeddings.mean_center(matrix)
        elif action == 'unitdim':
            matrix = embeddings.length_normalize_dimensionwise(matrix)
        elif action == 'centeremb':
            matrix = embeddings.mean_center_embeddingwise(matrix)

    # Write normalized embeddings
    f = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(words, matrix, f)
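# The embeddings.* normalization helpers used above are not shown in this file. The two
# sketches below illustrate what the 'unit' and 'center' actions amount to (assumed
# implementations, names are ours, for reference only).
def length_normalize_sketch(matrix):
    # scale every row to unit Euclidean length (rows with zero norm are left unchanged)
    norms = np.sqrt(np.sum(matrix**2, axis=1))
    norms[norms == 0] = 1
    return matrix / norms[:, np.newaxis]


def mean_center_sketch(matrix):
    # subtract the column-wise mean so every dimension is centered at zero
    return matrix - matrix.mean(axis=0)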
def deal_matrix_words(srcfile, trgfile):
    srcfile = open(srcfile, encoding='utf-8', errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)

    # Group vectors by the word prefix before '_' and collect them
    dic_word_matrix = {}
    for line in range(len(src_words)):
        srcSingle = src_words[line].split('_')[0]
        if srcSingle not in dic_word_matrix:
            dic_word_matrix[srcSingle] = [list(src_matrix[line])]
        else:
            dic_word_matrix[srcSingle].append(list(src_matrix[line]))

    # Average the vectors of each group and write the result
    words = []
    matrixs = []
    for itemKey, itemValue in dic_word_matrix.items():
        itemValue = np.mean(np.array(itemValue), 0)
        words.append(itemKey)
        matrixs.append(itemValue)

    trgfile = open(trgfile, mode='w', encoding='utf-8', errors='surrogateescape')
    embeddings.write(words, matrixs, trgfile)
                      (epoch + 1, i + 1, running_loss / 20))
                running_loss = 0.0

    # Map the trained embeddings through the network and write the results
    source_file = open('new_embedding_size200.en', encoding='utf-8', errors='surrogateescape')
    target_file = open('new_embedding_size200.de', encoding='utf-8', errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)
    input_view1, input_view2 = Variable(torch.from_numpy(en_vec).cuda()), Variable(torch.from_numpy(de_vec).cuda())
    res_envec, res_devec = net(input_view1.float(), input_view2.float())

    src_file = open('res.en', mode='w', encoding='utf-8', errors='surrogateescape')
    trg_file = open('res.de', mode='w', encoding='utf-8', errors='surrogateescape')
    embeddings.write(en_words, res_envec.detach().cpu().numpy(), src_file)
    embeddings.write(de_words, res_devec.detach().cpu().numpy(), trg_file)
    source_file.close()
    target_file.close()
    src_file.close()
    trg_file.close()
    print('Finished Training')
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('-c', '--orthogonal', dest='orthogonal', action='store_true', help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u', '--unconstrained', dest='orthogonal', action='store_false', help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group('self-learning arguments', 'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='forward', help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument('--numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile)
    trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary', file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an OOV
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])), x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i+MAX_DIM_X].dot(z[j:j+MAX_DIM_Z].T)
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i+k]:
                            best_sim_forward[i+k] = sim[k, l]
                            trg_indices_forward[i+k] = j + l
                    if args.direction in ('backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j+l]:
                                best_sim_backward[j+l] = sim[k, l]
                                src_indices_backward[j+l] = i + k
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([1 if trg_indices_forward[src] in trg else 0 for src, trg in validation.items()])
                similarity = np.mean([np.max(z[list(trg)].dot(xw[src])) for src, trg in validation.items()])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
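# The loop above alternates (1) solving the Procrustes mapping on the current dictionary and
# (2) re-inducing the dictionary as nearest neighbours under that mapping, stopping once the
# mean similarity improves by less than the threshold. A compact sketch of one forward
# induction pass (illustrative only; the MAX_DIM batching and the backward/union directions
# are omitted, and the function name is ours):
def induce_forward_dictionary(xw, z):
    # xw: mapped source matrix, z: target matrix (rows assumed length-normalized)
    sim = xw.dot(z.T)                  # (src_vocab, trg_vocab) similarity matrix
    trg_indices = sim.argmax(axis=1)   # best target word for every source word
    best_sim = sim.max(axis=1)         # similarity of the induced pairs (the objective)
    return trg_indices, best_sim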
def prefix_oov_embeddings_for_bilingual_dict(train_dict_fname, test_dict_fname,
                                             src_emb_fname, tgt_emb_fname,
                                             out_src_emb_fname, out_tgt_emb_fname,
                                             max_voc=200000):
    """
    Adds the embeddings for OOV words in the training and test dictionaries to the embedding file.
    This is done by using a prefix of the word as well as words for which the OOV is a prefix.
    Note that the output embedding file will contain only the OOV words plus the first max_voc
    words in the original embedding file.

    train_dict_fname:
    test_dict_fname:
    src_emb_fname: embedding file for source language
    tgt_emb_fname: embedding file for target language
    out_src_emb_fname: output embedding file for source language
    out_tgt_emb_fname: output embedding file for target language
    max_voc: number of vocab items to process from the embedding file
    """
    src_oov_words, src_emb_info, tgt_oov_words, tgt_emb_info = \
        get_oov_info_for_bilingual_dict(train_dict_fname, test_dict_fname,
                                        src_emb_fname, tgt_emb_fname, max_voc)
    src_vcb_words, src_emb = src_emb_info
    tgt_vcb_words, tgt_emb = tgt_emb_info

    ## compute embeddings for OOV
    ##### cat queries.txt | ./fasttext print-word-vectors model.bin
    src_oov_final_words, src_oov_emb = compute_prefix_embeddings(src_oov_words, (src_vcb_words, src_emb))
    tgt_oov_final_words, tgt_oov_emb = compute_prefix_embeddings(tgt_oov_words, (tgt_vcb_words, tgt_emb))

    if len(src_oov_words) != len(src_oov_final_words):
        print('WARNING: Embeddings not computed for {} words out of {} OOV source words'.format(
            len(src_oov_words) - len(src_oov_final_words), len(src_oov_words)))
    if len(tgt_oov_words) != len(tgt_oov_final_words):
        print('WARNING: Embeddings not computed for {} words out of {} OOV target words'.format(
            len(tgt_oov_words) - len(tgt_oov_final_words), len(tgt_oov_words)))

    ## write new embedding files to disk
    ## put the OOV words first, followed by the words in the original embedding file
    with open(out_src_emb_fname, 'w', encoding='utf-8') as out_src_emb_file, \
         open(out_tgt_emb_fname, 'w', encoding='utf-8') as out_tgt_emb_file:
        embeddings.write(src_oov_final_words + src_vcb_words,
                         np.concatenate([src_oov_emb, src_emb]), out_src_emb_file)
        embeddings.write(tgt_oov_final_words + tgt_vcb_words,
                         np.concatenate([tgt_oov_emb, tgt_emb]), out_tgt_emb_file)
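# compute_prefix_embeddings() is defined elsewhere; per the docstring above, OOV entries are
# approximated from in-vocabulary words that share a prefix with them. The sketch below is
# one plausible reading of that idea (assumed behaviour and a made-up min_prefix parameter,
# not the actual implementation):
def prefix_embedding_sketch(oov_word, vcb_words, emb, min_prefix=4):
    # average the vectors of vocabulary words that start with the OOV word, or that the
    # OOV word starts with, requiring a minimum length to avoid spurious matches
    candidates = [i for i, w in enumerate(vcb_words)
                  if len(w) >= min_prefix and (w.startswith(oov_word) or oov_word.startswith(w))]
    return np.mean(emb[candidates], axis=0) if candidates else None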
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--test-dict', help='the test dictionary file')
    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    # Note: changed the argument so that the dictionary is supplied with -d instead
    recommended_type.add_argument('--acl2017_seed', action='store_true', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')
    # still requires specifying a seed dictionary or another init
    recommended_type.add_argument('--ruder_emnlp2018', action='store_true', help='reproduce the EMNLP 2018 latent-variable model of Ruder et al.')
    recommended_type.add_argument('--ruder_emnlp2018_backward', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) with matching in the backward direction')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018_unsupervised', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with our unsupervised ACL 2018 system')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with our ACL 2018 system')
    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    lat_var_group = parser.add_argument_group('arguments for latent-variable model', 'Arguments for latent-variable model')
    lat_var_group.add_argument('--lat-var', action='store_true', help='use the latent-variable model')
    lat_var_group.add_argument('--n-similar', type=int, default=3, help='# of most similar trg indices used for sparsifying in latent-variable model')
    lat_var_group.add_argument('--n-repeats', default=1, type=int, help='repeats embeddings to get 2:2, 3:3, etc. alignment in latent-variable model')
    lat_var_group.add_argument('--asym', default='1:1', help='specify 1:2 or 2:1 for asymmetric matching in latent-variable model')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    # reduce stochastic interval
    # note: just the backward direction works surprisingly well
    if args.ruder_emnlp2018_artetxe_acl2018_unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018_artetxe_acl2018:
        parser.set_defaults(normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.ruder_emnlp2018_backward:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='backward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.init_dictionary, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)
    if args.verbose:
        print("Info: arguments\n\t" + "\n\t".join(["{}: {}".format(a, v) for a, v in vars(args).items()]), file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=200000)
    trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=200000)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        if args.verbose:
            print('Using unsupervised initialization...')
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        if args.verbose:
            print('Using numerals as seeds...')
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        if args.verbose:
            print('Using identical strings as seeds...')
            print(f'Found {len(identical)} identical strings.')
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        print(f'Using a dictionary of size {len(src_indices)}.')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an OOV
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            sims = np.zeros((src_size, trg_size), dtype=dtype)
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    simfwd[:j-i] = dropout(simfwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        # we get a dimension mismatch here as lat_var may produce fewer seeds
                        simfwd[:j-i].argmax(axis=1, out=trg_indices_forward[i:j])
                    sims[i:j] = simfwd
                if args.lat_var:
                    # TODO check if we can save memory by not storing a large sims matrix
                    src_indices_forward, trg_indices_forward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    simbwd[:j-i] = dropout(simbwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        simbwd[:j-i].argmax(axis=1, out=src_indices_backward[i:j])
                    sims[i:j] = simbwd
                if args.lat_var:
                    # swap the order of the indices
                    trg_indices_backward, src_indices_backward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
            # elif args.direction == 'intersection':
            #     fwd_pairs = zip(src_indices_forward, trg_indices_forward)
            #     bwd_pairs = zip(src_indices_backward, trg_indices_backward)
            #     src_indices, trg_indices = zip(*set(fwd_pairs).intersection(bwd_pairs))
            #     src_indices, trg_indices = xp.array(src_indices), xp.array(trg_indices)

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

            t = time.time()
            it += 1

    if args.test_dict:
        # save the embeddings for evaluation
        with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
                open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
            embeddings.write(src_words, xw, srcfile)
            embeddings.write(trg_words, zw, trgfile)

        # EVALUATING TRANSLATION
        print('Evaluating translation...')
        # we skip length normalization here

        # Read dictionary and compute coverage
        f = open(args.test_dict, encoding=args.encoding, errors='surrogateescape')
        src2trg = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        src = list(src2trg.keys())
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an OOV
        coverage = len(src2trg) / (len(src2trg) + len(oov))

        BATCH_SIZE = 500

        # Find translations
        translation = collections.defaultdict(int)
        # we just use nearest neighbour for retrieval
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]

        # Compute accuracy
        accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
        print('Coverage:{0:7.2%} Accuracy:{1:7.2%}'.format(coverage, accuracy))

    # Write mapped embeddings
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, zw, trgfile)
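# The "- knn_sim / 2" corrections in the self-learning loop above implement CSLS-style
# retrieval (Conneau et al., 2018): a pair's similarity is penalized by the mean similarity
# of each word to its k nearest neighbours in the other language, which reduces hubness.
# A small dense sketch of the same scoring (the batched code above avoids materializing the
# full matrix; for nearest-neighbour retrieval only the term that varies over the search
# axis matters, hence "equivalent to the real CSLS scores for NN"):
def csls_scores_sketch(xw, zw, k=10):
    sim = xw.dot(zw.T)                                      # cosine similarities (rows assumed normalized)
    knn_fwd = -np.sort(-sim, axis=1)[:, :k].mean(axis=1)    # mean similarity of each source word to its k NNs
    knn_bwd = -np.sort(-sim.T, axis=1)[:, :k].mean(axis=1)  # mean similarity of each target word to its k NNs
    return 2 * sim - knn_fwd[:, np.newaxis] - knn_bwd       # CSLS(x, z) = 2*cos(x, z) - r(x) - r(z)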
def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('dict_output', default='dictionary.pkl', help='the output dictionary pickle file') parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary') recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words') recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words') recommended_type.add_argument('--future', action='store_true', help='experiment with stuff') recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system') recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization') recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary') recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system') init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. 
words matching [0-9]+) as the seed dictionary') init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization') mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments') future_group.add_argument('--max_align', type=int, default=1, help='Number of top-ranked elements to align to each word (defaults to 1=base)') future_group.add_argument('--align_weight', choices=['unit', 'rr', 'softmax'], default='rr', help='Weights assigned to ranked elements in maximization phase (unit - no weighting; rr - reciprocal rank; softmax - NOT IMPLEMENTED YET)') self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)') self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument('--log', 
default='map.log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') args = parser.parse_args() if args.supervised is not None: parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) if args.semi_supervised is not None: parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.identical: parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.unsupervised or args.future: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10, max_align=2, align_weight='rr') if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.aaai2018: parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) if args.acl2017: parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.acl2017_seed: parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.emnlp2016: parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings print('reading embeddings...') srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) print('embeddings read') # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) print('CUDA loaded') else: xp = np xp.random.seed(args.seed) # Build word to index map (only relevant in supervised learning or with validation) src_word2ind = {word: i for i, word in 
enumerate(src_words)} print(f'mapped {len(src_words)} source words') trg_word2ind = {word: i for i, word in enumerate(trg_words)} print(f'mapped {len(trg_words)} target words') # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) print('normalization complete') # Build the seed dictionary src_indices = [] trg_indices = [] if args.init_unsupervised: sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab) u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False) xsim = (u*s).dot(u.T) u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False) zsim = (u*s).dot(u.T) del u, s, vt xsim.sort(axis=1) zsim.sort(axis=1) embeddings.normalize(xsim, args.normalize) embeddings.normalize(zsim, args.normalize) sim = xsim.dot(zsim.T) if args.csls_neighborhood > 0: knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood) sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2 if args.direction == 'forward': src_indices = xp.arange(sim_size) trg_indices = sim.argmax(axis=1) elif args.direction == 'backward': src_indices = sim.argmax(axis=0) trg_indices = xp.arange(sim_size) elif args.direction == 'union': src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0))) trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size))) del xsim, zsim, sim print(f'initialized unsupervised dictionary') elif args.init_numerals: numeral_regex = re.compile('^[0-9]+$') src_numerals = {word for word in src_words if numeral_regex.match(word) is not None} trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None} numerals = src_numerals.intersection(trg_numerals) for word in numerals: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) print('initialized numeral dictionary') elif args.init_identical: identical = set(src_words).intersection(set(trg_words)) for word in identical: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) print('initialized identical dictionary') else: f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr) f.close() print('initialized seed dictionary') # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) print(f'loaded validation dictionary with {validation_coverage:.3f} coverage') # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') print(f'logging into {args.log}') # Allocate memory xw = xp.empty_like(x) zw = xp.empty_like(z) src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], 
args.vocabulary_cutoff) simfwd = xp.empty((min(src_size, args.batch_size), trg_size), dtype=dtype) simbwd = xp.empty((min(trg_size, args.batch_size), src_size), dtype=dtype) #argsimsf = xp.empty((min(src_size, args.batch_size), args.max_align), dtype=int) #argsimsb = xp.empty((min(trg_size, args.batch_size), args.max_align), dtype=int) argsimsf = xp.empty((min(src_size, args.batch_size), 1), dtype=int) argsimsb = xp.empty((min(trg_size, args.batch_size), 1), dtype=int) if args.validation is not None: simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype) best_sim_forward = xp.full(src_size, -100, dtype=dtype) src_indices_forward = xp.array(list(range(src_size)) * args.max_align) trg_indices_forward = xp.zeros(src_size * args.max_align, dtype=int) best_sim_backward = xp.full(trg_size, -100, dtype=dtype) src_indices_backward = xp.zeros(trg_size * args.max_align, dtype=int) trg_indices_backward = xp.array(list(range(trg_size)) * args.max_align) xr = xp.zeros(((src_size+trg_size) * args.max_align, x.shape[1]), dtype=dtype) # assumes "both" param zr = xp.zeros(((src_size+trg_size) * args.max_align, z.shape[1]), dtype=dtype) # assumes "both" param all_coefs = xp.zeros(((src_size+trg_size) * args.max_align, 1), dtype=dtype) knn_sim_fwd = xp.zeros(src_size, dtype=dtype) knn_sim_bwd = xp.zeros(trg_size, dtype=dtype) # Training loop best_objective = objective = -100. it = 1 last_improvement = 0 keep_prob = args.stochastic_initial t = time.time() end = not args.self_learning print('starting training') while True: if it % 50 == 0: print(f'starting iteration {it}') # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: if keep_prob >= 1.0: end = True keep_prob = min(1.0, args.stochastic_multiplier*keep_prob) last_improvement = it # Update the embedding mapping (only affecting vectors that have dictionary mappings) if args.orthogonal or not end: # orthogonal mapping if it == 1: # only initialized alignment available u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) else: if args.align_weight == 'softmax': ### TODO individualized softmax coefficients ### raise 'Softmax weights not supported yet' else: ### TODO I'm assuming here that the alignment method is 'both', so everything's double ### TODO all_coefs can be computed outside the iteration loop # format: src_size_0, ..., src_size_k-1, trg_size_0, ..., trg_size_k-1 ncopies = args.max_align cutoffs = list(range(src_size*ncopies)[::src_size]) \ + list(range(src_size*ncopies,(src_size+trg_size)*ncopies)[::trg_size]) if args.align_weight == 'rr': coefs = [1. / (k+1) for k in range(ncopies)] * 2 else: # 'unit' coefs = [1.] 
* (ncopies * 2) for cf, co_s, co_e in zip(coefs, cutoffs, cutoffs[1:] + [len(all_coefs)]): all_coefs[co_s:co_e] = cf zr = z[trg_indices] * all_coefs xr = x[src_indices] * all_coefs u, s, vt = xp.linalg.svd(zr.T.dot(xr)) w = vt.T.dot(u.T) x.dot(w, out=xw) zw[:] = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) x.dot(w, out=xw) zw[:] = z else: # advanced mapping (default for end, acl2018) # remove lower-rank transformations midpoint = src_size * args.max_align src_indices = xp.concatenate((src_indices[:src_size], src_indices[midpoint:midpoint+trg_size])) trg_indices = xp.concatenate((trg_indices[:src_size], trg_indices[midpoint:midpoint+trg_size])) # TODO xw.dot(wx2, out=xw) and alike not working xw[:] = x zw[:] = z ### TODO entry point for adding more matrix operations ### # STEP 1: Whitening ### TODO figure out how weighted k-best affects this (and onwards) ### def whitening_transformation(m): u, s, vt = xp.linalg.svd(m, full_matrices=False) return vt.T.dot(xp.diag(1/s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction (default: OFF (0)) if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if end: break else: # Update the training dictionary (default direction - union) if args.direction in ('forward', 'union'): if args.csls_neighborhood > 0: # default acl2018: 10 for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) # get next batch to operate on zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j]) simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN # softmaxing #argsimsf[:] = dropout(-simfwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align] for k in range(args.max_align): argsimsf = dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1) simfwd[:j-i,argsimsf] = -200 trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf #trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf[:,k] if args.direction in ('backward', 'union'): if args.csls_neighborhood > 0: for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) # get next batch to operate on xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j]) 
simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN # softmaxing #argsimsb[:] = dropout(-simbwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align] for k in range(args.max_align): argsimsb = dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1) simbwd[:j-i,argsimsb] = -200 trg_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb #src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb[:,k] if args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union': src_indices = xp.concatenate((src_indices_forward, src_indices_backward)) trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward)) # Objective function evaluation if args.direction == 'forward': objective = xp.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = xp.mean(best_sim_backward).tolist() elif args.direction == 'union': # default objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 if objective - best_objective >= args.threshold: last_improvement = it best_objective = objective # Accuracy and similarity evaluation in validation (default - off) if args.validation is not None: src = list(validation.keys()) xw[src].dot(zw.T, out=simval) nn = asnumpy(simval.argmax(axis=1)) accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))]) similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close() # Write dictionary dictfile = open(args.dict_output, mode='wb') dictalign = list(zip(src_indices, trg_indices)) pickle.dump(dictalign, dictfile)
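# The self-learning loop above calls two helpers, topk_mean and dropout, that are not defined
# in this excerpt (they are presumably provided elsewhere in the repository). Below is a
# minimal NumPy sketch of what they plausibly do, inferred from their call sites: topk_mean
# feeds the CSLS penalty term, dropout implements stochastic dictionary induction. The real
# implementations may differ (e.g. they also need to work on CuPy arrays).
import numpy as np

def topk_mean(m, k, inplace=False):
    # Mean of the k largest entries in each row of m.
    n = m.shape[0]
    ans = np.zeros(n, dtype=m.dtype)
    if k <= 0:
        return ans
    if not inplace:
        m = np.array(m)
    ind0 = np.arange(n)
    ind1 = np.empty(n, dtype=np.intp)
    minimum = m.min()
    for _ in range(k):
        m.argmax(axis=1, out=ind1)
        ans += m[ind0, ind1]
        m[ind0, ind1] = minimum  # knock out the current maximum of each row
    return ans / k

def dropout(m, p):
    # Zero out each entry of m independently with probability p.
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask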
        trg_words.append(trg)
        trg_ind = trg_word2ind[trg]
        src_indices.append(src_ind)
        trg_indices.append(trg_ind)
    except KeyError:
        print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

# origEnVecs=preprocessing.normalize(en_vec)
# origForeignVecs=preprocessing.normalize(de_vec)
subsetEnVecs = en_vec[src_indices]
subsetForeignVecs = de_vec[trg_indices]

srcfile = open('en.train', mode='w', encoding='utf-8', errors='surrogateescape')
trgfile = open('de.train', mode='w', encoding='utf-8', errors='surrogateescape')
embeddings.write(src_words, subsetEnVecs, srcfile)
embeddings.write(trg_words, subsetForeignVecs, trgfile)
source_file.close()
target_file.close()
srcfile.close()
trgfile.close()
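# The pattern just above (read one "src trg" pair per line, map both words to indices, and warn
# on out-of-vocabulary entries) recurs in several of the scripts collected in this file. A small
# self-contained helper capturing it, shown here only for reference; the function name is
# illustrative and is not part of the repository:
import sys

def read_seed_dictionary(path, src_word2ind, trg_word2ind, encoding='utf-8'):
    # Returns parallel lists of source/target row indices for the dictionary entries whose
    # words are present in both vocabularies; OOV pairs are reported on stderr.
    src_indices, trg_indices = [], []
    with open(path, encoding=encoding, errors='surrogateescape') as f:
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
                continue
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
    return src_indices, trg_indices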
source_file = open('new_embedding_size640.en', encoding='utf-8', errors='surrogateescape')
target_file = open('new_embedding_size640.de', encoding='utf-8', errors='surrogateescape')
en_words, en_vec = embeddings.read(source_file)
de_words, de_vec = embeddings.read(target_file)
en_vec = embeddings.length_normalize(en_vec)
de_vec = embeddings.length_normalize(de_vec)

input_view1, input_view2 = Variable(torch.from_numpy(en_vec).cuda()), Variable(torch.from_numpy(de_vec).cuda())
res_envec, x1, res_devec, x2 = net(input_view1.float(), input_view2.float())
print(x1)

src_file = open('BiAE.en', mode='w', encoding='utf-8', errors='surrogateescape')
trg_file = open('BiAE.de', mode='w', encoding='utf-8', errors='surrogateescape')
# res_envec = embeddings.length_normalize(res_envec.data.cpu().numpy())
# res_devec = embeddings.length_normalize(res_devec.data.cpu().numpy())
res_envec = res_envec.data.cpu().numpy()
res_devec = res_devec.data.cpu().numpy()
embeddings.write(en_words, res_envec, src_file)
embeddings.write(de_words, res_devec, trg_file)
source_file.close()
target_file.close()
src_file.close()
trg_file.close()
print('Finished Training')
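# embeddings.length_normalize is used above (and in several later snippets) but its body is not
# part of this excerpt. Its usage suggests it rescales every row to unit Euclidean length before
# the vectors are fed to the network; a minimal NumPy sketch under that assumption (the module's
# actual implementation may differ, e.g. in how it treats all-zero rows):
import numpy as np

def length_normalize(matrix):
    # Normalize each row vector to unit L2 norm; all-zero rows are left unchanged.
    norms = np.sqrt(np.sum(matrix ** 2, axis=1))
    norms[norms == 0] = 1.0
    return matrix / norms[:, np.newaxis]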
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('sense_input', help='the input sense mapping matrix') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('tsns_output', default='tsns.pkl', help='the output target senses pickle file') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--future', action='store_true', help='experiment with stuff') recommended_type.add_argument('--toy', action='store_true', help='experiment with stuff on toy dataset') recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') 
self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', default='map.log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments') future_group.add_argument('--skip_top', type=int, default=0, help='Top k words to skip, presumably function') future_group.add_argument( '--start_src', action='store_true', help='Algorithm starts by tuning sense embeddings based on source') future_group.add_argument('--trim_senses', action='store_true', help='Trim sense table to working vocab') future_group.add_argument( '--lamb', type=float, default=0.5, help='Weight hyperparameter for sense alignment objectives') future_group.add_argument('--reglamb', type=float, default=1., help='Lasso regularization hyperparameter') future_group.add_argument( '--ccreglamb', type=float, default=0.1, help='Sense embedding regularization hyperparameter') future_group.add_argument('--inv_delta', type=float, default=0.0001, help='Delta_I added for inverting sense matrix') future_group.add_argument('--lasso_iters', type=int, default=10, help='Number of iterations for LASSO/NMF') future_group.add_argument('--iterations', type=int, default=-1, help='Number of overall model iterations') future_group.add_argument('--trg_batch', type=int, default=5000, help='Batch size for target steps') future_group.add_argument( '--trg_knn', action='store_true', help='Perform target sense mapping by k-nearest neighbors') future_group.add_argument( '--trg_sns_csls', type=int, default=10, help='K-nearest neighbors for CSLS target sense search') future_group.add_argument( '--senses_per_trg', type=int, default=1, help='K-max target sense mapping (default = 1 = off)') future_group.add_argument( '--gd', action='store_true', help='Apply gradient descent for assignment and synset embeddings') future_group.add_argument('--gd_lr', type=float, default=1e-2, help='Learning rate for SGD (default=0.01)') future_group.add_argument('--gd_wd', action='store_true', help='Weight decay in SGD') future_group.add_argument( '--gd_wd_hl', type=int, default=100, help='Weight decay half-life in SGD, default=100') future_group.add_argument( '--gd_clip', type=float, default=5., help='Per-coordinate gradient clipping (default=5)') future_group.add_argument( '--gd_map_steps', type=int, default=1, help='Consecutive steps for each target-sense mapping update phase') future_group.add_argument( '--gd_emb_steps', type=int, default=1, help='Consecutive steps for each sense embedding update phase') future_group.add_argument( '--base_prox_lambda', type=float, default=0.99, help='Lambda for proximal gradient in lasso step') future_group.add_argument( '--prox_decay', action='store_true', help='Multiply proximal lambda by itself each iteration') future_group.add_argument( 
'--sense_limit', type=float, default=1.1, help= 'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)' ) future_group.add_argument( '--gold_pairs', help='Gold data for evaluation, if exists (not for tuning)') future_group.add_argument( '--gold_threshold', type=float, default=0.0, help='Threshold for gold mapping (0 is fine if sparse)') future_group.add_argument('--debug', action='store_true') args = parser.parse_args() # pre-setting groups if args.toy: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=50, trim_senses=True, inv_delta=1., reglamb=0.2, lasso_iters=100, gd_wd=True, log='map-toy.log') if args.unsupervised or args.future: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=2000, trim_senses=True, gd_wd=True) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=20000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' # many operations not supported by cupy elif args.precision == 'fp32': # default dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings print('reading embeddings...') srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) print('embeddings read') # Read input source sense mapping print('reading sense mapping') src_senses = pickle.load(open(args.sense_input, 'rb')) if src_senses.shape[0] != x.shape[0]: src_senses = csr_matrix(src_senses.transpose() ) # using non-cuda scipy because of 'inv' impl #src_senses = get_sparse_module(src_senses) print( f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros' ) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) print('CUDA loaded') else: xp = np xp.random.seed(args.seed) # removed word to index map (only relevant in supervised learning or with validation) # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) print('normalization complete') # removed building the seed dictionary # removed validation step # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') print(f'logging into {args.log}') # Allocate memory # Initialize the projection matrices W(s) = W(t) = I. 
xw = xp.empty_like(x) zw = xp.empty_like(z) xw[:] = x zw[:] = z src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0] - args.skip_top, args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0] - args.skip_top, args.vocabulary_cutoff) emb_dim = x.shape[1] cutoff_end = min(src_size + args.skip_top, x.shape[0]) if args.trim_senses: # reshape sense assignment src_senses = src_senses[args.skip_top:cutoff_end] # new columns for words with no senses in original input ### TODO might also need this if not trimming (probably kinda far away) newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\ if src_senses.getrow(i).getnnz() == 0] #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file: # dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0] # pickle.dump(np.array(dummy_col_idcs), dummy_cols_file) # trim senses no longer used, add new ones colsums = src_senses.sum(axis=0).tolist()[0] kept_senses = [i for i, j in enumerate(colsums) if j > 0] #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file: # pickle.dump(np.array(kept_senses), kept_save_file) src_senses = hstack([src_senses[:, kept_senses]] + newcols) print( f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros' ) sense_size = src_senses.shape[1] if args.gold_pairs is not None: with open(args.gold_pairs, 'rb') as gold_pairs_f: gold_pairs = pickle.load(gold_pairs_f) gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \ if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]] gold_trgs = sorted(set([x[0] for x in gold_pairs])) gold_senses = sorted(set([x[1] for x in gold_pairs])) gold_domain_size = len(gold_trgs) * len(gold_senses) print( f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses' ) # Initialize the concept embeddings from the source embeddings ### TODO maybe try gradient descent instead? ### TODO (pre-)create non-singular alignment matrix cc = xp.empty((sense_size, emb_dim), dtype=dtype) # \tilde{E} t01 = time.time() print('starting psinv calc') src_sns_psinv = psinv(src_senses, dtype, args.inv_delta) xecc = x[args.skip_top:cutoff_end].T.dot( get_sparse_module(src_senses).toarray()).T # sense_size * emb_dim cc[:] = src_sns_psinv.dot(xecc) print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds', file=sys.stderr) if args.verbose: # report precision of psedo-inverse operation, checked by inverting pseudo_id = src_senses.transpose().dot(src_senses).dot( src_sns_psinv.get()) real_id = sparse_id(sense_size) rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size) print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}') ### TODO initialize trg_senses using seed dictionary instead? 
trg_sns_size = trg_size if args.trim_senses else z.shape[0] trg_senses = csr_matrix( (trg_sns_size, sense_size)) # using non-cuda scipy because of 'inv' impl zecc = xp.empty_like(xecc) # sense_size * emb_dim #tg_grad = xp.empty((trg_sns_size, sense_size)) if args.gd: # everything can be done on gpu src_senses = get_sparse_module(src_senses, dtype=dtype) trg_senses = get_sparse_module(trg_senses, dtype=dtype) if args.sense_limit > 0.0: trg_sense_limit = int(args.sense_limit * src_senses.getnnz()) if args.verbose: print( f'limiting target side to {trg_sense_limit} sense mappings' ) else: trg_sense_limit = -1 ### TODO return memory assignment for similarities? # Training loop if args.gd: prox_lambda = args.base_prox_lambda else: lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\ positive=True, warm_start=True) # TODO more parametrization if args.log is not None: if args.gd: print(f'gradient descent lr: {args.gd_lr}', file=log) print(f'base proximal lambda: {args.base_prox_lambda}', file=log) else: print(f'lasso regularization: {args.reglamb}', file=log) print(f'lasso iterations: {args.lasso_iters}', file=log) print(f'inversion epsilon: {args.inv_delta}', file=log) if args.gold_pairs is not None: print(f'gold mappings: {len(gold_pairs)}', file=log) print( f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings', file=log) log.flush() best_objective = objective = 1000000000. correct_mappings = -1 regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb it = 1 last_improvement = 0 t = time.time() map_gd_lr = args.gd_lr emb_gd_lr = args.gd_lr end = False print('starting training') if args.start_src: print('starting with converging synset embeddings') it_range = range( args.iterations ) ### TODO possibly add arg, but there's early stopping if not args.verbose: it_range = tqdm(it_range) prev_obj = float('inf') for pre_it in it_range: if args.gd_wd: emb_gd_lr = args.gd_lr * pow(0.5, floor( pre_it / args.gd_wd_hl)) # Synset embedding cc_grad = src_senses.T.dot( xw[args.skip_top:cutoff_end] - src_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad # Source projection u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) pre_objective = ((xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 pre_objective = float(pre_objective) if args.verbose and pre_it > 0 and pre_it % 10 == 0: print( f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}' ) if pre_objective > prev_obj: print( f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}' ) # revert cc -= emb_gd_lr * cc_grad break prev_obj = pre_objective while True: if it % 50 == 0: print( f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}' ) # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: last_improvement = it if args.iterations > 0 and it > args.iterations: end = True ### update target assignments (6) - lasso-esque regression time6 = time.time() # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1)) if args.trg_knn: # for csls-based neighborhoods knn_sense = xp.full(sense_size, -100) for i in range(0, sense_size, args.trg_batch): batch_end = min(i + args.trg_batch, 
sense_size) sim_sense_trg = cc[i:batch_end].dot( zw[args.skip_top:cutoff_end].T) knn_sense[i:batch_end] = topk_mean(sim_sense_trg, k=args.trg_sns_csls, inplace=True) # calculate new target mappings trg_senses = lil_matrix(trg_senses.shape) for i in range(0, trg_size, args.trg_batch): sns_batch_end = min(i + args.trg_batch, trg_size) z_i = i + args.skip_top z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0]) sims = zw[z_i:z_batch_end].dot(cc.T) sims -= knn_sense / 2 # equivalent to the real CSLS scores for NN best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() # second-to-lth-best for l in range(args.senses_per_trg - 1): sims[(list(range(sims.shape[0])), best_idcs)] = 0. best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() trg_senses = get_sparse_module(trg_senses.tocsr()) elif args.gd: ### TODO add args.skip_top calculations if args.gd_wd: true_it = (it - 1) * args.gd_map_steps map_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'mapping learning rate: {map_gd_lr}') for k in range(args.gd_map_steps): # st <- st + eta * (ew - st.dot(es)).dot(es.T) # allow up to sense_limit updates, clip gradient batch_grads = [] for i in range(0, trg_size, args.trg_batch): batch_end = min(i + args.trg_batch, trg_size) tg_grad_b = (zw[i:batch_end] - trg_senses[i:batch_end].dot(cc)).dot(cc.T) # proximal gradient tg_grad_b += prox_lambda tg_grad_b.clip(None, 0.0, out=tg_grad_b) batch_grads.append(batch_sparse(tg_grad_b)) tg_grad = get_sparse_module(vstack(batch_grads)) del tg_grad_b if args.prox_decay: prox_lambda *= args.base_prox_lambda ### TODO consider weight decay here as well (args.gd_wd) trg_senses -= map_gd_lr * tg_grad # allow up to sense_limit nonzeros if trg_sense_limit > 0: trg_senses = trim_sparse(trg_senses, trg_sense_limit, clip=None) ### TODO consider finishing up with lasso (maybe only in final iteration) else: ### TODO add args.skip_top calculations # parallel LASSO (no cuda impl) cccpu = cc.get().T # emb_dim * sense_size lasso_model.fit(cccpu, zw[:trg_size].get().T) ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it) trg_senses = lasso_model.sparse_coef_ if args.verbose: print( f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') # Write target sense mapping with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl', mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile) ### update synset embeddings (10) time10 = time.time() if args.gd and args.gd_emb_steps > 0: ### TODO probably handle sizes and/or threshold sparse matrix if args.gd_wd: true_it = (it - 1) * args.gd_emb_steps emb_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'embedding learning rate: {emb_gd_lr}') ### replace block for no-source-tuning mode all_senses = trg_senses if args.start_src else get_sparse_module( vstack((src_senses.get(), trg_senses.get()), format='csr'), dtype=dtype) aw = zw[args. 
skip_top:cutoff_end] if args.start_src else xp.concatenate( (xw[args.skip_top:cutoff_end], zw[args.skip_top:cutoff_end])) for i in range(args.gd_emb_steps): cc_grad = all_senses.T.dot( aw - all_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad else: ### TODO add args.skip_top calculations all_senses = get_sparse_module( vstack((src_senses, trg_senses), format='csr')) xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\ .dot(all_senses.toarray()).T # sense_size * emb_dim all_sns_psinv = psinv( all_senses.get(), dtype, args.inv_delta ) ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same] cc[:] = all_sns_psinv.dot(xzecc) if args.verbose: print(f'synset embedding update: {time.time()-time10:.2f}', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') ### update projections (3,5) # write to zw and xw if args.orthogonal or not end: ### remove block for no-source-tuning mode # source side - mappings don't change so xecc is constant #if not args.start_src: # need to do this anyway whenever cc updates time3 = time.time() u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) if args.verbose: print(f'source projection update: {time.time()-time3:.2f}', file=sys.stderr) # target side - compute sense mapping first time3 = time.time() zecc.fill(0.) for i in range(0, trg_size, args.trg_batch): end_idx = min(i + args.trg_batch, trg_size) zecc += z[i:end_idx].T.dot( get_sparse_module(trg_senses[i:end_idx]).toarray()).T u, s, vt = xp.linalg.svd(cc.T.dot(zecc)) wz = vt.T.dot(u.T).astype(dtype) z.dot(wz, out=zw) if args.verbose: print(f'target projection update: {time.time()-time3:.2f}', file=sys.stderr) ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc. 
# Objective function evaluation time_obj = time.time() trg_senses_l1 = float(trg_senses.sum()) src_obj = (float( xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 trg_obj = (float( xp.linalg.norm( zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2 objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1 # TODO consider thresholding reg part if args.verbose: print(f'objective calculation: {time.time()-time_obj:.2f}', file=sys.stderr) if objective - best_objective <= -args.threshold: last_improvement = it best_objective = objective # WordNet transduction evaluation (can't tune on this) if args.gold_pairs is not None: np_trg_senses = trg_senses.get() trg_corr = [ p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold ] correct_mappings = len(trg_corr) domain_trgs = np_trg_senses[gold_trgs][:, gold_senses] else: correct_mappings = -1 # Logging duration = time.time() - t if args.verbose: print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('objective: {0:.3f}'.format(objective), file=sys.stderr) print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1), file=sys.stderr) if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0: print( f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision', file=sys.stderr) print(file=sys.stderr) sys.stderr.flush() if args.log is not None: print( f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}', file=log) log.flush() if end: break t = time.time() it += 1 # Write mapped embeddings with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile: embeddings.write(src_words, xw, srcfile) with open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile: embeddings.write(trg_words, zw, trgfile) # Write target sense mapping with open(args.tsns_output, mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile)
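# psinv(...) is called when initializing and updating the concept embeddings above, but it is
# not defined in this excerpt. From its usage (cc[:] = src_sns_psinv.dot(xecc), the --inv_delta
# help text, and the "using non-cuda scipy because of 'inv' impl" comment), it plausibly returns
# the ridge-regularized inverse (S^T S + delta*I)^-1 of the sparse sense-assignment matrix S,
# which turns cc = psinv.dot(S^T X) into a least-squares estimate of the sense embeddings.
# A minimal SciPy sketch under that assumption (the real function may additionally wrap the
# result with get_sparse_module for CUDA use):
from scipy.sparse import identity, csr_matrix
from scipy.sparse.linalg import inv as sparse_inv

def psinv(sense_matrix, dtype, delta):
    # sense_matrix: sparse (n_words, n_senses) assignment matrix S.
    gram = sense_matrix.transpose().dot(sense_matrix).tocsc()        # S^T S, (n_senses, n_senses)
    gram = gram + delta * identity(gram.shape[0], dtype=dtype, format='csc')
    return csr_matrix(sparse_inv(gram), dtype=dtype)                 # (S^T S + delta*I)^-1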
def main(): # Parse command line arguments # https://docs.python.org/3/library/argparse.html parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') # description - This argument gives a brief description of what the program does and how it works. parser.add_argument('src_input', help='the input source embeddings') # help - A brief description of what the argument does. parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') # -- optional # default - The value produced if the argument is absent from the command line. parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') # choices - A container of the allowable values for the argument. parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') # action - The basic type of action to be taken when this argument is encountered at the command line. # store_ture - store true value parser.add_argument( '--batch_size', default=1000, type=int, help= 'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory' ) parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') parser.add_argument('--draw', action='store_true', help='use seaborn to draw') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') # add_argument_group() - returns an argument group object which has an add_argument() method just like a regular ArgumentParser. 
# it's a better conceptual grouping of arguments than this default one recommended_type = recommended_group.add_mutually_exclusive_group() # argparse will make sure that only one of the arguments in the mutually exclusive group was present on the command line recommended_type.add_argument( '--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') recommended_type.add_argument( '--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary') recommended_type.add_argument( '--identical', action='store_true', help= 'recommended if you have no seed dictionary but can rely on identical words' ) recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system') # A name for the argument in usage messages recommended_type.add_argument( '--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization') recommended_type.add_argument( '--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary') recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument( '-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') init_type.add_argument( '--init_numerals', action='store_true', help= 'use latin numerals (i.e. 
words matching [0-9]+) as the seed dictionary' ) init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') # no normalization in default mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument( '--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') args = parser.parse_args() if args.supervised is not None: parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', 
trg_dewhiten='trg', batch_size=1000) if args.semi_supervised is not None: parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.identical: parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.aaai2018: parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) if args.acl2017: parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.acl2017_seed: parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) if args.emnlp2016: parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) else: xp = np # fix random seed xp.random.seed(args.seed) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) # Build the seed dictionary src_indices = [] trg_indices = [] dict_size = 5000 if args.init_unsupervised: sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min( x.shape[0], z.shape[0], args.unsupervised_vocab) u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False) xsim = (u * s).dot(u.T) u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False) zsim = (u * s).dot(u.T) del u, s, vt xsim.sort(axis=1) zsim.sort(axis=1) embeddings.normalize(xsim, args.normalize) embeddings.normalize(zsim, args.normalize) sim = xsim.dot(zsim.T) if args.csls_neighborhood > 0: knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) knn_sim_bwd = topk_mean(sim.T, 
k=args.csls_neighborhood) sim -= knn_sim_fwd[:, xp.newaxis] / 2 + knn_sim_bwd / 2 if args.direction == 'forward': src_indices = xp.arange(sim_size) trg_indices = sim.argmax(axis=1) elif args.direction == 'backward': src_indices = sim.argmax(axis=0) trg_indices = xp.arange(sim_size) elif args.direction == 'union': src_indices = xp.concatenate( (xp.arange(sim_size), sim.argmax(axis=0))) trg_indices = xp.concatenate( (sim.argmax(axis=1), xp.arange(sim_size))) del xsim, zsim, sim elif args.init_numerals: numeral_regex = re.compile('^[0-9]+$') # ^ match from start of words $ match to end of words # consider numbers from 0 to 9 # http://www.runoob.com/python/python-reg-expressions.html src_numerals = { word for word in src_words if numeral_regex.match(word) is not None } trg_numerals = { word for word in trg_words if numeral_regex.match(word) is not None } numerals = src_numerals.intersection(trg_numerals) for word in numerals: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) elif args.init_identical: identical = set(src_words).intersection(set(trg_words)) for word in identical: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) else: f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) if len(src_indices) == dict_size: break # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') # Allocate memory xw = xp.empty_like(x) zw = xp.empty_like(z) # choose to cut-off or not src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0], args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0], args.vocabulary_cutoff) simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype) simbwd = xp.empty((args.batch_size, src_size), dtype=dtype) if args.validation is not None: simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype) best_sim_forward = xp.full(src_size, -100, dtype=dtype) src_indices_forward = xp.arange(src_size) trg_indices_forward = xp.zeros(src_size, dtype=int) best_sim_backward = xp.full(trg_size, -100, dtype=dtype) src_indices_backward = xp.zeros(trg_size, dtype=int) trg_indices_backward = xp.arange(trg_size) knn_sim_fwd = xp.zeros(src_size, dtype=dtype) knn_sim_bwd = xp.zeros(trg_size, dtype=dtype) # Training loop best_objective = objective = -100. 
it = 1 last_improvement = 0 keep_prob = args.stochastic_initial t = time.time() end = not args.self_learning while True: # Increase the keep probability if we have not improve in args.stochastic_interval iterations # for init-numeral : if objective doesn's increase after 1 iteration, then stop it directly if it - last_improvement > args.stochastic_interval: if keep_prob >= 1.0: end = True keep_prob = min(1.0, args.stochastic_multiplier * keep_prob) last_improvement = it # Update the embedding mapping if args.orthogonal or not end: # orthogonal mapping u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) w = vt.T.dot(u.T) x.dot(w, out=xw) zw[:] = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot( x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) x.dot(w, out=xw) zw[:] = z else: # advanced mapping # TODO xw.dot(wx2, out=xw) and alike not working xw[:] = x zw[:] = z # STEP 1: Whitening def whitening_transformation(m): u, s, vt = xp.linalg.svd(m, full_matrices=False) return vt.T.dot(xp.diag(1 / s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot( zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if end: break else: # Update the training dictionary if args.direction in ('forward', 'union'): if args.csls_neighborhood > 0: for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i]) knn_sim_bwd[i:j] = topk_mean(simbwd[:j - i], k=args.csls_neighborhood, inplace=True) for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i]) simfwd[:j - i].max(axis=1, out=best_sim_forward[i:j]) simfwd[:j - i] -= knn_sim_bwd / 2 # Equivalent to the real CSLS scores for NN dropout(simfwd[:j - i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) if args.direction in ('backward', 'union'): if args.csls_neighborhood > 0: for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i]) knn_sim_fwd[i:j] = topk_mean(simfwd[:j - i], k=args.csls_neighborhood, inplace=True) for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i]) simbwd[:j - i].max(axis=1, out=best_sim_backward[i:j]) simbwd[:j - i] -= knn_sim_fwd / 2 # Equivalent to the real CSLS scores for NN dropout(simbwd[:j - i], 1 - keep_prob).argmax( axis=1, out=src_indices_backward[i:j]) if args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union': 
src_indices = xp.concatenate( (src_indices_forward, src_indices_backward)) trg_indices = xp.concatenate( (trg_indices_forward, trg_indices_backward)) # Objective function evaluation if args.direction == 'forward': objective = xp.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = xp.mean(best_sim_backward).tolist() elif args.direction == 'union': objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 if objective - best_objective >= args.threshold: last_improvement = it best_objective = objective # Accuracy and similarity evaluation in validation if args.validation is not None: src = list(validation.keys()) xw[src].dot(zw.T, out=simval) nn = asnumpy(simval.argmax(axis=1)) accuracy = np.mean([ 1 if nn[i] in validation[src[i]] else 0 for i in range(len(src)) ]) similarity = np.mean([ max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src)) ]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) print( '\t- Drop probability: {0:9.4f}%'.format(100 - 100 * keep_prob), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format( 100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format( it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # draw distribution of language space if args.draw: PCA_model = PCA(n_components=2) x_PCA = PCA_model.fit_transform(asnumpy(xw)) x1 = [feature[0] for feature in x_PCA] y1 = [feature[1] for feature in x_PCA] z_PCA = PCA_model.fit_transform(asnumpy(zw)) x2 = [feature[0] for feature in z_PCA] y2 = [feature[1] for feature in z_PCA] ''' # draw with plt plt.scatter(x2, y2, s=10, c='r', alpha=0.4) plt.scatter(x1, y1, s=10, c='b', alpha=0.2) plt.savefig('./share_space.png') ''' # draw with seaborn plt.figure() sns.jointplot(x1, y1, kind='hex', color='b') plt.savefig('./src_mapped_emb.png') plt.figure() sns.jointplot(x2, y2, kind='hex', color='g') plt.savefig('./trg_mapped_emb.png') # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close()
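# The orthogonal mapping update in the training loop above (u, s, vt = svd(Z^T X);
# W = vt.T.dot(u.T)) is the closed-form solution of the orthogonal Procrustes problem
# argmin_W ||XW - Z||_F subject to W^T W = I, computed over the current dictionary pairs.
# A tiny self-contained NumPy check of that property on random data; all names below are local
# to this sketch and not part of the repository:
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 20)                         # "source" vectors aligned row-by-row with Z
Z = rng.randn(100, 20)                         # "target" vectors
u, s, vt = np.linalg.svd(Z.T.dot(X))
W = vt.T.dot(u.T)                              # same formula as in the loop above
assert np.allclose(W.T.dot(W), np.eye(20), atol=1e-8)      # W is orthogonal
identity_fit = np.linalg.norm(X - Z)           # residual of the trivial identity mapping
procrustes_fit = np.linalg.norm(X.dot(W) - Z)  # residual of the learned orthogonal mapping
print(f'||X - Z|| = {identity_fit:.3f}, ||XW - Z|| = {procrustes_fit:.3f}')
# procrustes_fit can never exceed identity_fit, since the identity is itself orthogonal.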
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Generate latent space embeddings') parser.add_argument('emb1', help='path to embedding 1') parser.add_argument('emb2', help='path to embedding 2') parser.add_argument( '--geomm_embeddings_path', default=None, type=str, help= 'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.' ) parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--verbose', default=0, type=int, help='Verbose') mapping_group = parser.add_argument_group( 'mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument('--dictionary', default=sys.stdin.fileno(), help='the dictionary file (defaults to stdin)') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'no'], nargs=2, default=[], help= 'the normalization actions performed in sequence for embeddings 1 and 2' ) geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method') geomm_group.add_argument('--l2_reg', type=float, default=1e2, help='Lambda for L2 Regularization') geomm_group.add_argument( '--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument( '--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization') args = parser.parse_args() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading embeddings data...') # Read input embeddings emb1file = open(args.emb1, encoding=args.encoding, errors='surrogateescape') emb2file = open(args.emb2, encoding=args.encoding, errors='surrogateescape') emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype) emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype) # Build word to index map emb1_word2ind = {word: i for i, word in enumerate(emb1_words)} emb2_word2ind = {word: i for i, word in enumerate(emb2_words)} noov = 0 emb1_indices = [] emb2_indices = [] f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: emb1, emb2 = line.split() try: emb1_ind = emb1_word2ind[emb1] emb2_ind = emb2_word2ind[emb2] emb1_indices.append(emb1_ind) emb2_indices.append(emb2_ind) except KeyError: noov += 1 if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format( emb1, emb2)) #, file=sys.stderr f.close() if args.verbose: print('Number of embedding pairs having at least one OOV: {}'.format( noov)) emb1_indices = emb1_indices emb2_indices = emb2_indices if args.verbose: print('Normalizing embeddings...') # STEP 0: Normalization if len(args.normalize) > 0: x = normalize_emb(x, args.normalize[0]) z = normalize_emb(z, args.normalize[1]) # Step 1: Optimization if args.verbose: print('Beginning Optimization') start_time = time.time() x_count = len(set(emb1_indices)) z_count = len(set(emb2_indices)) # Filter out uniq values map_dict_emb1 = {} map_dict_emb2 = {} I = 0 uniq_emb1 = [] uniq_emb2 = [] for i in range(len(emb1_indices)): if emb1_indices[i] not in map_dict_emb1.keys(): map_dict_emb1[emb1_indices[i]] = I I += 1 uniq_emb1.append(emb1_indices[i]) J = 0 for j in range(len(emb2_indices)): if emb2_indices[j] not in map_dict_emb2.keys(): map_dict_emb2[emb2_indices[j]] = J J += 1 uniq_emb2.append(emb2_indices[j]) # Creating dictionary matrix row = list(range(0, x_count)) col = list(range(0, x_count)) data = [1 for i in range(0, 
x_count)] print(f"Counts: {x_count}, {z_count}") A = coo_matrix((data, (row, col)), shape=(x_count, z_count)) np.random.seed(0) Lambda = args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() Xemb1 = x[uniq_emb1] Zemb2 = z[uniq_emb2] del x, z gc.collect() Kx, Kz = Xemb1, Zemb2 XtAZ = Kx.T.dot(A.dot(Kz)) XtX = Kx.T.dot(Kx) ZtZ = Kz.T.dot(Kz) AA = np.sum(A * A) W = (U1.dot(B)).dot(U2.T) regularizer = 0.5 * Lambda * (TT.sum(B**2)) sXtX = shared(XtX) sZtZ = shared(ZtZ) sXtAZ = shared(XtAZ) cost = regularizer wtxtxw = W.T.dot(sXtX.dot(W)) wtxtxwztz = wtxtxw.dot(sZtZ) cost += TT.nlinalg.trace(wtxtxwztz) cost += -2 * TT.sum(W * sXtAZ) cost += shared(AA) solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter) manifold = Product([ Stiefel(Kx.shape[1], Kx.shape[1]), Stiefel(Kz.shape[1], Kz.shape[1]), PositiveDefinite(Kx.shape[1]) ]) problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3) wopt = solver.solve(problem) print(f"Problem solved ...") w = wopt U1 = w[0] U2 = w[1] B = w[2] print(f"Model copied ...") gc.collect() # Step 2: Transformation xw = Kx.dot(U1).dot(scipy.linalg.sqrtm(B)) zw = Kz.dot(U2).dot(scipy.linalg.sqrtm(B)) print(f"Transformation done ...") end_time = time.time() if args.verbose: print('Completed training in {0:.2f} seconds'.format(end_time - start_time)) del Kx, Kz, B, U1, U2 gc.collect() ### Save the GeoMM embeddings if requested xw_n = embeddings.length_normalize(xw) zw_n = embeddings.length_normalize(zw) del xw, zw gc.collect() if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path, exist_ok=True) out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb1.vec') new_emb1_words = [] for id in uniq_emb1: new_emb1_words.append(emb1_words[id]) with open(out_emb_fname, 'w', encoding=args.encoding) as outfile: embeddings.write(new_emb1_words, xw_n, outfile) new_emb2_words = [] for id in uniq_emb2: new_emb2_words.append(emb2_words[id]) out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb2.vec') with open(out_emb_fname, 'w', encoding=args.encoding) as outfile: embeddings.write(new_emb2_words, zw_n, outfile) exit(0)
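# A minimal standalone sketch (assuming the factors U1, U2 and the positive
# definite matrix B returned by the solver above) of how both embedding sets
# are projected into the shared latent space; it simply restates the
# sqrtm(B) transformation and length normalization performed by the script.
import numpy as np
import scipy.linalg


def to_latent_space(x, z, U1, U2, B):
    """Project embedding-1 rows x and embedding-2 rows z into the common space."""
    B_half = scipy.linalg.sqrtm(B).real   # .real guards against tiny imaginary noise
    xw = x.dot(U1).dot(B_half)            # embedding 1 -> latent space
    zw = z.dot(U2).dot(B_half)            # embedding 2 -> latent space
    # length-normalize, matching the embeddings the script writes out
    xw /= np.linalg.norm(xw, axis=1, keepdims=True)
    zw /= np.linalg.norm(zw, axis=1, keepdims=True)
    return xw, zw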
en_vec = embeddings.length_normalize(en_vec) de_vec = embeddings.length_normalize(de_vec) input_view1, input_view2 = Variable( torch.from_numpy(en_vec).cuda()), Variable( torch.from_numpy(de_vec).cuda()) res_envec = net(input_view1.float()) src_file = open('LinearMappingres.en', mode='w', encoding='utf-8', errors='surrogateescape') trg_file = open('LinearMappingres.de', mode='w', encoding='utf-8', errors='surrogateescape') res_envec = embeddings.length_normalize(res_envec.data.cpu().numpy()) embeddings.write(en_words, res_envec, src_file) embeddings.write(de_words, input_view2.float().data.cpu().numpy(), trg_file) source_file.close() target_file.close() src_file.close() trg_file.close() print('Finished Training') # print(net.view1_fc.weight.data)
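# Hedged sketch only: the fragment above applies an already-trained `net` to map
# the length-normalized English vectors into the German space before writing both
# sides to disk. The network definition is not part of this fragment; a minimal
# compatible module, consistent with the final `net.view1_fc.weight` debug print,
# would be a single linear layer. The dimensions below are hypothetical.
import torch
import torch.nn as nn


class LinearMapper(nn.Module):
    def __init__(self, dim_in=300, dim_out=300):
        super().__init__()
        self.view1_fc = nn.Linear(dim_in, dim_out, bias=False)  # assumed shape

    def forward(self, view1):
        return self.view1_fc(view1)


# net = LinearMapper().cuda()   # hypothetical instantiation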
def share_embedding(words, matrix): with open('word.txt', mode='w', encoding='utf-8', errors='surrogateescape') as f: embeddings.write(words, matrix, f)
def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('--model_path', default=None, type=str, help='directory to save the model') parser.add_argument('--geomm_embeddings_path', default=None, type=str, help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.') parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary') parser.add_argument('--verbose', default=0,type=int, help='Verbose') mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)') mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method') geomm_group.add_argument('--l2_reg', type=float,default=1e2, help='Lambda for L2 Regularization') geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization') eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation') eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time') eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation') eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS') args = parser.parse_args() BATCH_SIZE = args.eval_batch_size ## Logging #method_name = os.path.join('logs','geomm') #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) #if not os.path.exists(directory): # os.makedirs(directory) #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train)) #log_file_name = log_file_name + '.log' #class Logger(object): # def __init__(self): # self.terminal = sys.stdout # self.log = open(os.path.join(directory,log_file_name), "a") # def write(self, message): # self.terminal.write(message) # self.log.write(message) # def flush(self): # #this flush method is needed for python 3 compatibility. # #this handles the flush command by doing nothing. # #you might want to specify some extra behavior here. 
# pass #sys.stdout = Logger() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading train data...') # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype) trg_words, z = embeddings.read(trgfile,max_voc=args.max_vocab, dtype=dtype) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build training dictionary noov=0 src_indices = [] trg_indices = [] f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape') for line in f: src,trg = line.split() if args.max_vocab: src=src.lower() trg=trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: noov+=1 if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg)) #, file=sys.stderr f.close() if args.verbose: print('Number of training pairs having at least one OOV: {}'.format(noov)) src_indices = src_indices trg_indices = trg_indices if args.verbose: print('Normalizing embeddings...') # STEP 0: Normalization for action in args.normalize: if action == 'unit': x = embeddings.length_normalize(x) z = embeddings.length_normalize(z) elif action == 'center': x = embeddings.mean_center(x) z = embeddings.mean_center(z) elif action == 'unitdim': x = embeddings.length_normalize_dimensionwise(x) z = embeddings.length_normalize_dimensionwise(z) elif action == 'centeremb': x = embeddings.mean_center_embeddingwise(x) z = embeddings.mean_center_embeddingwise(z) # Step 1: Optimization if args.verbose: print('Beginning Optimization') start_time = time.time() x_count = len(set(src_indices)) z_count = len(set(trg_indices)) A = np.zeros((x_count,z_count)) # Creating dictionary matrix from training set map_dict_src={} map_dict_trg={} I=0 uniq_src=[] uniq_trg=[] for i in range(len(src_indices)): if src_indices[i] not in map_dict_src.keys(): map_dict_src[src_indices[i]]=I I+=1 uniq_src.append(src_indices[i]) J=0 for j in range(len(trg_indices)): if trg_indices[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices[j]]=J J+=1 uniq_trg.append(trg_indices[j]) for i in range(len(src_indices)): A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1 np.random.seed(0) Lambda=args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() Kx, Kz = x[uniq_src], z[uniq_trg] XtAZ = Kx.T.dot(A.dot(Kz)) XtX = Kx.T.dot(Kx) ZtZ = Kz.T.dot(Kz) # AA = np.sum(A*A) # this can be added if cost needs to be compared to original geomm W = (U1.dot(B)).dot(U2.T) regularizer = 0.5*Lambda*(TT.sum(B**2)) sXtX = shared(XtX) sZtZ = shared(ZtZ) sXtAZ = shared(XtAZ) cost = regularizer wtxtxw = W.T.dot(sXtX.dot(W)) wtxtxwztz = wtxtxw.dot(sZtZ) cost += TT.nlinalg.trace(wtxtxwztz) cost += -2 * TT.sum(W * sXtAZ) # cost += shared(AA) # this can be added if cost needs to be compared with original geomm solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter) manifold =Product([Stiefel(x.shape[1], x.shape[1]),Stiefel(z.shape[1], x.shape[1]),PositiveDefinite(x.shape[1])]) #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)]) problem = Problem(manifold=manifold, cost=cost, arg=[U1,U2,B], verbosity=3) wopt = solver.solve(problem) w= wopt U1 = w[0] U2 = 
w[1] B = w[2] ### Save the models if requested if args.model_path is not None: os.makedirs(args.model_path,exist_ok=True) np.savetxt('{}/U_src.csv'.format(args.model_path),U1) np.savetxt('{}/U_tgt.csv'.format(args.model_path),U2) np.savetxt('{}/B.csv'.format(args.model_path),B) # Step 2: Transformation xw = x.dot(U1).dot(scipy.linalg.sqrtm(B)) zw = z.dot(U2).dot(scipy.linalg.sqrtm(B)) end_time = time.time() if args.verbose: print('Completed training in {0:.2f} seconds'.format(end_time-start_time)) gc.collect() ### Save the GeoMM embeddings if requested xw_n = embeddings.length_normalize(xw) zw_n = embeddings.length_normalize(zw) if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path,exist_ok=True) out_emb_fname=os.path.join(args.geomm_embeddings_path,'src.vec') with open(out_emb_fname,'w',encoding=args.encoding) as outfile: embeddings.write(src_words,xw_n,outfile) out_emb_fname=os.path.join(args.geomm_embeddings_path,'trg.vec') with open(out_emb_fname,'w',encoding=args.encoding) as outfile: embeddings.write(trg_words,zw_n,outfile) # Step 3: Evaluation if args.normalize_eval: xw = xw_n zw = zw_n X = xw[src_indices] Z = zw[trg_indices] # Loading test dictionary f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape') src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src=src.lower() trg=trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) ### compute nearest neigbours of x in z t=time.time() nbrhood_x=np.zeros(xw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1) nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1) ### compute nearest neigbours of z in x (GPU version) nbrhood_z=np.zeros(zw.shape[0]) with cp.cuda.Device(0): nbrhood_z2=cp.zeros(zw.shape[0]) batch_num=1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood] nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1)) batch_num+=1 nbrhood_z=cp.asnumpy(nbrhood_z2) #### compute nearest neigbours of z in x (CPU version) #nbrhood_z=np.zeros(zw.shape[0]) #for i in range(0, len(zw.shape[0]), BATCH_SIZE): # j = min(i + BATCH_SIZE, len(zw.shape[0])) # similarities = zw[i:j].dot(xw.T) # similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1) # nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1) #### find translation #for i in range(0, len(src), BATCH_SIZE): # j = min(i + BATCH_SIZE, len(src)) # similarities = xw[src[i:j]].dot(zw.T) # similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z # nn = similarities.argmax(axis=1).tolist() # similarities = 
np.argsort((similarities),axis=1) # nn5 = (similarities[:,-5:]) # nn10 = (similarities[:,-10:]) # for k in range(j-i): # translation[src[i+k]] = nn[k] # translation5[src[i+k]] = nn5[k] # translation10[src[i+k]] = nn10[k] #if args.geomm_embeddings_path is not None: # delim=',' # os.makedirs(args.geomm_embeddings_path,exist_ok=True) # translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv') # with open(translations_fname,'w',encoding=args.encoding) as translations_file: # for src_id in src: # src_word = src_words[src_id] # all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ] # trgout_words = [ trg_words[j] for j in translation10[src_id] ] # ss = list(nn10[src_id,:]) # # p1 = ':'.join(all_trg_words) # p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] ) # translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) ) ### find translation (and write to file if output requested) delim=',' translations_file =None if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path,exist_ok=True) translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv') translations_file = open(translations_fname,'w',encoding=args.encoding) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() similarities = np.argsort((similarities),axis=1) nn5 = (similarities[:,-5:]) nn10 = (similarities[:,-10:]) for k in range(j-i): translation[src[i+k]] = nn[k] translation5[src[i+k]] = nn5[k] translation10[src[i+k]] = nn10[k] if args.geomm_embeddings_path is not None: src_id=src[i+k] src_word = src_words[src_id] all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ] trgout_words = [ trg_words[j] for j in translation10[src_id] ] #ss = list(nn10[src_id,:]) p1 = ':'.join(all_trg_words) p2 = ':'.join(trgout_words) #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] ) translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim) ) if args.geomm_embeddings_path is not None: translations_file.close() accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src]) mean=0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean+=1 break mean/=len(src) accuracy5 = mean mean=0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean+=1 break mean/=len(src) accuracy10 = mean message = args.src_input.split(".")[-2] + "-->" + args.trg_input.split(".")[-2] + ": " + 'Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10) print(message)
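# Minimal NumPy sketch of the CSLS retrieval implemented above: a candidate pair
# (x, z) is scored as 2*cos(x, z) minus the mean cosine of each word to its k
# nearest neighbours in the other language (the nbrhood_x / nbrhood_z terms).
import numpy as np


def csls_translate(xw, zw, src_ids, k=10):
    """Return the CSLS-best target row for each source row in src_ids.
    Assumes xw and zw are length-normalized, so dot product == cosine."""
    sim = xw.dot(zw.T)                                      # all cosines
    nbr_x = -np.sort(-sim, axis=1)[:, :k].mean(axis=1)      # r_T(x): mean sim to k-NN targets
    nbr_z = -np.sort(-sim.T, axis=1)[:, :k].mean(axis=1)    # r_S(z): mean sim to k-NN sources
    csls = 2 * sim[src_ids] - nbr_x[src_ids][:, None] - nbr_z[None, :]
    return csls.argmax(axis=1)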
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map the source embeddings into the target embedding space' ) parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp64', help='the floating-point precision (defaults to fp64)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') mapping_group = parser.add_argument_group( 'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)') mapping_group.add_argument( '-d', '--dictionary', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') self_learning_group = parser.add_argument_group( 'self-learning arguments', 'Optional arguments for self-learning (ACL 2017)') self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') self_learning_group.add_argument( '--direction', choices=['forward', 'backward', 'union'], default='forward', help='the direction for dictionary induction (defaults to forward)') self_learning_group.add_argument( '--numerals', action='store_true', help= 'use latin numerals (i.e. 
words matching [0-9]+) as the seed dictionary' ) self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--validation', default=None, help='a dictionary file for validation at each iteration') self_learning_group.add_argument( '--log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') advanced_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments (AAAI 2018)') advanced_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') advanced_group.add_argument( '--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') advanced_group.add_argument( '--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') advanced_group.add_argument( '--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') advanced_group.add_argument( '--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') advanced_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) else: xp = np # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build training dictionary src_indices = [] trg_indices = [] if args.numerals: if args.dictionary != sys.stdin.fileno(): print('WARNING: Using numerals instead of the training dictionary', file=sys.stderr) numeral_regex = re.compile('^[0-9]+$') src_numerals = { word for word in src_words if numeral_regex.match(word) is not None } trg_numerals = { word for word in trg_words if numeral_regex.match(word) is not None } numerals = src_numerals.intersection(trg_numerals) for word in numerals: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) else: f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = 
set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') # STEP 0: Normalization for action in args.normalize: if action == 'unit': x = embeddings.length_normalize(x) z = embeddings.length_normalize(z) elif action == 'center': x = embeddings.mean_center(x) z = embeddings.mean_center(z) elif action == 'unitdim': x = embeddings.length_normalize_dimensionwise(x) z = embeddings.length_normalize_dimensionwise(z) elif action == 'centeremb': x = embeddings.mean_center_embeddingwise(x) z = embeddings.mean_center_embeddingwise(z) # Training loop prev_objective = objective = -100. it = 1 t = time.time() while it == 1 or objective - prev_objective >= args.threshold: # Update the embedding mapping if args.orthogonal: # orthogonal mapping u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) w = vt.T.dot(u.T) xw = x.dot(w) zw = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot( x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) xw = x.dot(w) zw = z else: # advanced mapping xw = x zw = z # STEP 1: Whitening def whitening_transformation(m): u, s, vt = xp.linalg.svd(m, full_matrices=False) return vt.T.dot(xp.diag(1 / s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot( zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if args.self_learning: # Update the training dictionary best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype) src_indices_forward = xp.arange(x.shape[0]) trg_indices_forward = xp.zeros(x.shape[0], dtype=int) best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype) src_indices_backward = xp.zeros(z.shape[0], dtype=int) trg_indices_backward = xp.arange(z.shape[0]) for i in range(0, x.shape[0], MAX_DIM_X): j = min(x.shape[0], i + MAX_DIM_X) for k in range(0, z.shape[0], MAX_DIM_Z): l = min(z.shape[0], k + MAX_DIM_Z) sim = xw[i:j].dot(zw[k:l].T) if args.direction in ('forward', 'union'): ind = sim.argmax(axis=1) val = sim[xp.arange(sim.shape[0]), ind] ind += k mask = (val > best_sim_forward[i:j]) best_sim_forward[i:j][mask] = val[mask] trg_indices_forward[i:j][mask] = ind[mask] if args.direction in ('backward', 'union'): ind = sim.argmax(axis=0) val = sim[ind, xp.arange(sim.shape[1])] ind += i mask = (val > best_sim_backward[k:l]) best_sim_backward[k:l][mask] = val[mask] src_indices_backward[k:l][mask] = ind[mask] if 
args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union': src_indices = xp.concatenate( (src_indices_forward, src_indices_backward)) trg_indices = xp.concatenate( (trg_indices_forward, trg_indices_backward)) # Objective function evaluation prev_objective = objective if args.direction == 'forward': objective = xp.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = xp.mean(best_sim_backward).tolist() elif args.direction == 'union': objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 # Accuracy and similarity evaluation in validation if args.validation is not None: src = list(validation.keys()) sim = xw[src].dot(zw.T) # TODO Assuming that it fits in memory nn = asnumpy(sim.argmax(axis=1)) accuracy = np.mean([ 1 if nn[i] in validation[src[i]] else 0 for i in range(len(src)) ]) similarity = np.mean([ max([sim[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src)) ]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format( 100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format( it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close()
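# Minimal sketch of the orthogonal update used inside the self-learning loop
# above: for paired rows X (source) and Z (target), the rotation maximizing the
# sum of dot products is W = V . U^T, where U, S, V^T = SVD(Z^T X).
import numpy as np


def procrustes_mapping(x_pairs, z_pairs):
    u, s, vt = np.linalg.svd(z_pairs.T.dot(x_pairs))
    return vt.T.dot(u.T)   # use as: x.dot(w) lies in the target space


# Example (hypothetical arrays): w = procrustes_mapping(x[src_indices], z[trg_indices])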
def add_oov_embeddings(train_dict_fname, test_dict_fname, src_emb_fname, tgt_emb_fname, out_src_emb_fname, out_tgt_emb_fname, src_model_path, tgt_model_path, fast_text_binary_path, max_voc=200000, emb_format='txt'): """ Adds the embeddings for OOV words in the training and test dictionaries to the embedding file. This is done by computing the embeddings using FastText. So, this method applies to FastText embeddings only. Note that the output embedding file will contain only the OOV words plus the first max_voc words in the original embedding file. train_dict_fname: test_dict_fname: src_emb_fname: embedding file for source language tgt_emb_fname: embedding file for target language out_src_emb_fname: output embedding file for source language out_tgt_emb_fname: output embedding file for target language src_model_path: fasttext model for source language tgt_model_path: fasttext model for target language fast_text_binary_path: path to fasttext binary max_voc: number of vocab items to process from the embedding file emb_format: format of embedding files. Currently supported: 'txt' - standard fast text format """ ## read dictionaries train_dict = read_dict(train_dict_fname) test_dict = read_dict(test_dict_fname) # read embeddings src_vcb_words = None src_emb = None tgt_vcb_words = None tgt_emb = None with open(src_emb_fname, 'r', encoding='utf-8' ) as src_emb_file, \ open(tgt_emb_fname, 'r', encoding='utf-8' ) as tgt_emb_file: src_vcb_words, src_emb = embeddings.read(src_emb_file, max_voc) tgt_vcb_words, tgt_emb = embeddings.read(tgt_emb_file, max_voc) ## find OOVs src_oov_words = set() src_oov_words.update(train_dict.keys()) src_oov_words.update(test_dict.keys()) src_oov_words.difference_update(src_vcb_words) print('Number of src OOV words: {}'.format(len(src_oov_words))) tgt_oov_words = set() tgt_oov_words.update(train_dict.values()) tgt_oov_words.update(test_dict.values()) tgt_oov_words.difference_update(tgt_vcb_words) print('Number of tgt OOV words: {}'.format(len(tgt_oov_words))) ## compute embeddings for OOV ##### cat queries.txt | ./fasttext print-word-vectors model.bin src_oov_final_words, src_oov_emb = compute_fasttext_embeddings( src_oov_words, src_model_path, fast_text_binary_path) tgt_oov_final_words, tgt_oov_emb = compute_fasttext_embeddings( tgt_oov_words, tgt_model_path, fast_text_binary_path) if (len(src_oov_words) != len(src_oov_final_words)): print( 'WARNING: Embeddings not computed for {} words out of {} OOV source words' .format( len(src_oov_words) - len(src_oov_final_words), len(src_oov_words))) if (len(tgt_oov_words) != len(tgt_oov_final_words)): print( 'WARNING: Embeddings not computed for {} words out of {} OOV target words' .format( len(tgt_oov_words) - len(tgt_oov_final_words), len(tgt_oov_words))) ## write new embeddings files to disk ## put the OOV words first followed by words in the original embeddings file with open(out_src_emb_fname, 'w', encoding='utf-8' ) as out_src_emb_file, \ open(out_tgt_emb_fname, 'w', encoding='utf-8' ) as out_tgt_emb_file: embeddings.write(src_oov_final_words + src_vcb_words, np.concatenate([src_oov_emb, src_emb]), out_src_emb_file) embeddings.write(tgt_oov_final_words + tgt_vcb_words, np.concatenate([tgt_oov_emb, tgt_emb]), out_tgt_emb_file)
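# Hedged sketch of the `compute_fasttext_embeddings` helper referenced above (its
# real implementation lives elsewhere in the repo). As the docstring hints, it
# pipes the OOV words to `fasttext print-word-vectors model.bin` and parses one
# word plus its vector from each line of stdout.
import subprocess
import numpy as np


def compute_fasttext_embeddings_sketch(words, model_path, fast_text_binary_path):
    query = '\n'.join(words) + '\n'
    proc = subprocess.run([fast_text_binary_path, 'print-word-vectors', model_path],
                          input=query.encode('utf-8'),
                          stdout=subprocess.PIPE, check=True)
    out_words, vectors = [], []
    for line in proc.stdout.decode('utf-8').splitlines():
        fields = line.rstrip().split(' ')
        out_words.append(fields[0])
        vectors.append(np.asarray(fields[1:], dtype='float32'))
    return out_words, (np.vstack(vectors) if vectors else np.zeros((0, 0), dtype='float32'))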
def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary') recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words') recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words') init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. 
words matching [0-9]+) as the seed dictionary') init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization') mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)') self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') args = parser.parse_args() if args.supervised is not None: parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) if args.semi_supervised 
is not None: parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.identical: parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) if args.unsupervised: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) np.random.seed(args.seed) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) # Build the seed dictionary src_indices = [] trg_indices = [] if args.init_unsupervised: sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab) u, s, vt = np.linalg.svd(x[:sim_size], full_matrices=False) xsim = (u*s).dot(u.T) u, s, vt = np.linalg.svd(z[:sim_size], full_matrices=False) zsim = (u*s).dot(u.T) del u, s, vt xsim.sort(axis=1) zsim.sort(axis=1) embeddings.normalize(xsim, args.normalize) embeddings.normalize(zsim, args.normalize) sim = xsim.dot(zsim.T) if args.csls_neighborhood > 0: knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood) sim -= knn_sim_fwd[:, np.newaxis]/2 + knn_sim_bwd/2 if args.direction == 'forward': src_indices = np.arange(sim_size) trg_indices = sim.argmax(axis=1) elif args.direction == 'backward': src_indices = sim.argmax(axis=0) trg_indices = np.arange(sim_size) elif args.direction == 'union': src_indices = np.concatenate((np.arange(sim_size), sim.argmax(axis=0))) trg_indices = np.concatenate((sim.argmax(axis=1), np.arange(sim_size))) del xsim, zsim, sim elif args.init_numerals: numeral_regex = re.compile('^[0-9]+$') src_numerals = {word for word in src_words if numeral_regex.match(word) is not None} trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None} numerals = src_numerals.intersection(trg_numerals) for word in numerals: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) elif args.init_identical: identical = set(src_words).intersection(set(trg_words)) for word in identical: src_indices.append(src_word2ind[word]) trg_indices.append(trg_word2ind[word]) else: f = open(args.init_dictionary, 
encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr) # Read validation dictionary if args.validation is not None: f = open(args.validation, encoding=args.encoding, errors='surrogateescape') validation = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] validation[src_ind].add(trg_ind) vocab.add(src) except KeyError: oov.add(src) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov validation_coverage = len(validation) / (len(validation) + len(oov)) # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') # Allocate memory xw = np.empty_like(x) zw = np.empty_like(z) src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff) simfwd = np.empty((args.batch_size, trg_size), dtype=dtype) simbwd = np.empty((args.batch_size, src_size), dtype=dtype) if args.validation is not None: simval = np.empty((len(validation.keys()), z.shape[0]), dtype=dtype) best_sim_forward = np.full(src_size, -100, dtype=dtype) src_indices_forward = np.arange(src_size) trg_indices_forward = np.zeros(src_size, dtype=int) best_sim_backward = np.full(trg_size, -100, dtype=dtype) src_indices_backward = np.zeros(trg_size, dtype=int) trg_indices_backward = np.arange(trg_size) knn_sim_fwd = np.zeros(src_size, dtype=dtype) knn_sim_bwd = np.zeros(trg_size, dtype=dtype) # Training loop best_objective = objective = -100. 
it = 1 last_improvement = 0 keep_prob = args.stochastic_initial t = time.time() end = not args.self_learning while True: # Increase the keep probability if we have not improve in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: if keep_prob >= 1.0: end = True keep_prob = min(1.0, args.stochastic_multiplier*keep_prob) last_improvement = it # Update the embedding mapping if args.orthogonal or not end: # orthogonal mapping u, s, vt = np.linalg.svd(z[trg_indices].T.dot(x[src_indices])) w = vt.T.dot(u.T) x.dot(w, out=xw) zw[:] = z elif args.unconstrained: # unconstrained mapping x_pseudoinv = np.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T) w = x_pseudoinv.dot(z[trg_indices]) x.dot(w, out=xw) zw[:] = z else: # advanced mapping # TODO xw.dot(wx2, out=xw) and alike not working xw[:] = x zw[:] = z # STEP 1: Whitening def whitening_transformation(m): u, s, vt = np.linalg.svd(m, full_matrices=False) return vt.T.dot(np.diag(1/s)).dot(vt) if args.whiten: wx1 = whitening_transformation(xw[src_indices]) wz1 = whitening_transformation(zw[trg_indices]) xw = xw.dot(wx1) zw = zw.dot(wz1) # STEP 2: Orthogonal mapping wx2, s, wz2_t = np.linalg.svd(xw[src_indices].T.dot(zw[trg_indices])) wz2 = wz2_t.T xw = xw.dot(wx2) zw = zw.dot(wz2) # STEP 3: Re-weighting xw *= s**args.src_reweight zw *= s**args.trg_reweight # STEP 4: De-whitening if args.src_dewhiten == 'src': xw = xw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2)) elif args.src_dewhiten == 'trg': xw = xw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2)) if args.trg_dewhiten == 'src': zw = zw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2)) elif args.trg_dewhiten == 'trg': zw = zw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2)) # STEP 5: Dimensionality reduction if args.dim_reduction > 0: xw = xw[:, :args.dim_reduction] zw = zw[:, :args.dim_reduction] # Self-learning if end: break else: # Update the training dictionary if args.direction in ('forward', 'union'): if args.csls_neighborhood > 0: for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j]) simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) if args.direction in ('backward', 'union'): if args.csls_neighborhood > 0: for i in range(0, src_size, simfwd.shape[0]): j = min(i + simfwd.shape[0], src_size) xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True) for i in range(0, trg_size, simbwd.shape[0]): j = min(i + simbwd.shape[0], trg_size) zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j]) simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j]) if args.direction == 'forward': src_indices = src_indices_forward trg_indices = trg_indices_forward elif args.direction == 'backward': src_indices = src_indices_backward trg_indices = trg_indices_backward elif args.direction == 'union': src_indices = np.concatenate((src_indices_forward, src_indices_backward)) trg_indices = np.concatenate((trg_indices_forward, 
trg_indices_backward)) # Objective function evaluation if args.direction == 'forward': objective = np.mean(best_sim_forward).tolist() elif args.direction == 'backward': objective = np.mean(best_sim_backward).tolist() elif args.direction == 'union': objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)).tolist() / 2 if objective - best_objective >= args.threshold: last_improvement = it best_objective = objective # Accuracy and similarity evaluation in validation if args.validation is not None: src = list(validation.keys()) xw[src].dot(zw.T, out=simval) nn = asnumpy(simval.argmax(axis=1)) accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))]) similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))]) # Logging duration = time.time() - t if args.verbose: print(file=sys.stderr) print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr) if args.validation is not None: print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr) sys.stderr.flush() if args.log is not None: val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log) log.flush() t = time.time() it += 1 # Write mapped embeddings srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') embeddings.write(src_words, xw, srcfile) embeddings.write(trg_words, zw, trgfile) srcfile.close() trgfile.close()
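# Hedged sketch of the two helpers the self-learning loop above relies on (their
# real definitions live elsewhere in the repo): `topk_mean` averages the k largest
# similarities per row (the CSLS neighbourhood term), and `dropout` randomly
# zeroes similarities so dictionary induction stays stochastic while keep_prob < 1.
# Signatures are simplified; the in-repo versions also take an `inplace` flag and
# an `out` argument.
import numpy as np


def topk_mean(m, k):
    # mean of the k largest entries of each row
    top = -np.partition(-m, k - 1, axis=1)[:, :k]
    return top.mean(axis=1)


def dropout(m, p):
    # keep each entry with probability (1 - p); dropped entries become 0
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask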
def share_embedding(words, matrix): with open('D:/TestData/predata/miandian_Position_50.txt', mode='w', encoding='utf-8', errors='surrogateescape') as f: embeddings.write(words, matrix, f)
def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='Generate meta embeddings') parser.add_argument('emb1', help='path to embedding 1') parser.add_argument('emb2', help='path to embedding 2') parser.add_argument('--method', choices=['avg', 'conc'], default=['avg'], type=str, nargs=1, help='meta embedding generation method') parser.add_argument('--meta_embeddings_path', default='./', type=str, help='directory to save the output meta embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--verbose', default=0, type=int, help='Verbose') parser.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'no'], nargs=2, default=[], help= 'the normalization actions performed in sequence for embeddings 1 and 2' ) args = parser.parse_args() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading embeddings data...') emb1file = open(args.emb1, encoding=args.encoding, errors='surrogateescape') emb2file = open(args.emb2, encoding=args.encoding, errors='surrogateescape') emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype) emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype) if len(args.normalize) > 0: x = normalize_emb(x, args.normalize[0]) z = normalize_emb(z, args.normalize[1]) emb1 = Embedding(emb1_words, x) emb2 = Embedding(emb2_words, z) if args.method[0] == "avg": meta_emb = avg(emb1, emb2) elif args.method[0] == "conc": meta_emb = concatenate(emb1, emb2) del emb1, emb2 gc.collect() meta_emb_words = [] meta_emb_vecs = [] for w, v in meta_emb.word_vec_map.items(): meta_emb_words += [w] meta_emb_vecs += [v] del meta_emb gc.collect() out_emb_fname = os.path.join(args.meta_embeddings_path, 'meta_emb.vec') with open(out_emb_fname, 'w', encoding=args.encoding) as outfile: embeddings.write(meta_emb_words, meta_emb_vecs, outfile)
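# Minimal sketch of the two meta-embedding methods selectable above (the actual
# avg/concatenate helpers and the Embedding wrapper are defined elsewhere): for
# words present in both vocabularies, 'avg' averages the two vectors and 'conc'
# stacks them. How the real helpers treat words missing from one side may differ.
import numpy as np


def meta_embed(words1, x, words2, z, method='avg'):
    ind1 = {w: i for i, w in enumerate(words1)}
    ind2 = {w: i for i, w in enumerate(words2)}
    common = [w for w in words1 if w in ind2]
    vecs = []
    for w in common:
        v1, v2 = x[ind1[w]], z[ind2[w]]
        vecs.append((v1 + v2) / 2 if method == 'avg' else np.concatenate([v1, v2]))
    return common, np.vstack(vecs)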
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map the source embeddings into the target embedding space' ) parser.add_argument('emb_file', help='the input target embeddings') parser.add_argument( '--lang_list', default='', help= 'the list of languages listed in the same order as in the input embedding `emb_file` (comma-separated). e.g. "en,es,fr"' ) parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--model_path', default=None, type=str, help='directory to save the model') parser.add_argument( '--geomm_embeddings_path', default=None, type=str, help= 'directory to save the output GeoMM Multi latent space embeddings. The output embeddings are normalized.' ) parser.add_argument( '--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary') parser.add_argument('--verbose', default=0, type=int, help='Verbose') mapping_group = parser.add_argument_group( 'mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument( '-dtrain_file', '--dictionary_train_file', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)') mapping_group.add_argument( '-dtest_file', '--dictionary_test_file', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') geomm_group = parser.add_argument_group( 'GeoMM Multi arguments', 'Arguments for GeoMM Multi method') geomm_group.add_argument('--l2_reg', type=float, default=1e3, help='Lambda for L2 Regularization') geomm_group.add_argument( '--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument( '--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization') eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation') eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time') eval_group.add_argument('--eval_batch_size', type=int, default=1000, help='Batch size for evaluation') eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS') args = parser.parse_args() BATCH_SIZE = args.eval_batch_size lang_list = None ## Logging #method_name = os.path.join('logs','geomm_multi') #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) #if not os.path.exists(directory): # os.makedirs(directory) #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file)) #log_file_name = log_file_name + '.log' #class Logger(object): # def __init__(self): # self.terminal = sys.stdout # self.log = open(os.path.join(directory,log_file_name), "a") # def write(self, message): # self.terminal.write(message) # self.log.write(message) # def flush(self): # #this flush method is needed for python 3 compatibility. # #this handles the flush command by doing nothing. # #you might want to specify some extra behavior here. 
# pass #sys.stdout = Logger() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading train data...') words = [] emb = [] with open(args.emb_file, encoding=args.encoding, errors='surrogateescape') as f: for line in f: srcfile = open(line.strip(), encoding=args.encoding, errors='surrogateescape') words_temp, x_temp = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype) words.append(words_temp) emb.append(x_temp) # Build word to index map word2ind = [] for lang in words: word2ind.append({word: i for i, word in enumerate(lang)}) ##### Set language names ## language id map if args.lang_list == '': lang_list = [str(i) for i in range(len(emb))] else: lang_list = args.lang_list.split(',') # Build training dictionary train_pairs = [] with open(args.dictionary_train_file, encoding=args.encoding, errors='surrogateescape') as ff: for line in ff: vals = line.split(',') curr_dict = [int(vals[0].strip()), int(vals[1].strip())] src_indices = [] trg_indices = [] with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f: for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = word2ind[curr_dict[0]][src] trg_ind = word2ind[curr_dict[1]][trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'. format(src, trg), file=sys.stderr) curr_dict.append(src_indices) curr_dict.append(trg_indices) train_pairs.append(curr_dict) if args.verbose: print('Normalizing embeddings...') # Step 0: Normalization for action in args.normalize: if action == 'unit': for i in range(len(emb)): emb[i] = embeddings.length_normalize(emb[i]) elif action == 'center': for i in range(len(emb)): emb[i] = embeddings.mean_center(emb[i]) elif action == 'unitdim': for i in range(len(emb)): emb[i] = embeddings.length_normalize_dimensionwise(emb[i]) elif action == 'centeremb': for i in range(len(emb)): emb[i] = embeddings.mean_center_embeddingwise(emb[i]) # Step 1: Optimization if args.verbose: print('Beginning Optimization') start_time = time.time() mean_size = 0 for tp in range(len(train_pairs)): src_indices = train_pairs[tp][2] trg_indices = train_pairs[tp][3] x_count = len(set(src_indices)) z_count = len(set(trg_indices)) A = np.zeros((x_count, z_count)) # Creating dictionary matrix from training set map_dict_src = {} map_dict_trg = {} I = 0 uniq_src = [] uniq_trg = [] for i in range(len(src_indices)): if src_indices[i] not in map_dict_src.keys(): map_dict_src[src_indices[i]] = I I += 1 uniq_src.append(src_indices[i]) J = 0 for j in range(len(trg_indices)): if trg_indices[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices[j]] = J J += 1 uniq_trg.append(trg_indices[j]) for i in range(len(src_indices)): A[map_dict_src[src_indices[i]], map_dict_trg[trg_indices[i]]] = 1 train_pairs[tp].append(uniq_src) train_pairs[tp].append(uniq_trg) train_pairs[tp].append(A) mean_size += (len(uniq_src) * len(uniq_trg)) mean_size = mean_size / len(train_pairs) np.random.seed(0) Lambda = args.l2_reg variables = [] manif = [] low_rank = emb[0].shape[1] for i in range(len(emb)): variables.append(TT.matrix()) manif.append(Stiefel(emb[i].shape[1], low_rank)) variables.append(TT.matrix()) manif.append(PositiveDefinite(low_rank)) B = variables[-1] cost = 0.5 * Lambda * (TT.sum(B**2)) for i in range(len(train_pairs)): x = emb[train_pairs[i][0]] z = emb[train_pairs[i][1]] U1 = variables[train_pairs[i][0]] U2 = variables[train_pairs[i][1]] 
cost = cost + TT.sum( ((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T)))).dot( shared(z[train_pairs[i][5]]).T) - shared(train_pairs[i][6]))** 2) / float(len(train_pairs[i][2])) solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter, mingradnorm=1e-12) manifold = Product(manif) problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3) wopt = solver.solve(problem) w = wopt ### Save the models if requested if args.model_path is not None: os.makedirs(args.model_path, exist_ok=True) for i in range(len(emb)): np.savetxt('{0}/U_{1}.csv'.format(args.model_path, lang_list[i]), wopt[i]) np.savetxt('{}/B.csv'.format(args.model_path), wopt[-1]) #with open('{}/lang_id_map.txt'.format(args.model_path),'w',encoding='utf-8') as idmapfile: # for lang in lang_list: # idmapfile.write(lang+'\n') # Step 2: Transformation Bhalf = scipy.linalg.sqrtm(wopt[-1]) test_emb = [] for i in range(len(emb)): test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf)) end_time = time.time() if args.verbose: print('Completed training in {0:.2f} seconds'.format(end_time - start_time)) gc.collect() ### Save the GeoMM embeddings if requested if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path, exist_ok=True) for i in range(len(test_emb)): out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb_{0}.vec'.format(lang_list[i])) with open(out_emb_fname, 'w', encoding=args.encoding) as outfile: embeddings.write(words[i], embeddings.length_normalize(test_emb[i]), outfile) # Step 3: Evaluation if args.verbose: print('Beginning Evaluation') if args.normalize_eval: for i in range(len(test_emb)): test_emb[i] = embeddings.length_normalize(test_emb[i]) # Loading test dictionary with open(args.dictionary_test_file, encoding=args.encoding, errors='surrogateescape') as ff: for line in ff: vals = line.split(',') curr_dict = [int(vals[0].strip()), int(vals[1].strip())] with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f: src_word2ind = word2ind[curr_dict[0]] trg_word2ind = word2ind[curr_dict[1]] xw = test_emb[curr_dict[0]] zw = test_emb[curr_dict[1]] src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1 * np.partition( -1 * similarities, args.csls_neighbourhood - 1, axis=1) nbrhood_x[src[i:j]] = np.mean( similarities_x[:, :args.csls_neighbourhood], axis=1) batch_num = 1 with cp.cuda.Device(1): for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z2[i:j] 
                        batch_num += 1
                nbrhood_z = cp.asnumpy(nbrhood_z2)

                # Score every source test word against all target words with CSLS
                # and keep the top-1/5/10 candidates
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    # CSLS score: 2*cos(x, z) - r_src(x) - r_trg(z)
                    similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
                    nn = similarities.argmax(axis=1).tolist()
                    similarities = np.argsort(similarities, axis=1)
                    nn5 = similarities[:, -5:]
                    nn10 = similarities[:, -10:]
                    for k in range(j - i):
                        translation[src[i + k]] = nn[k]
                        translation5[src[i + k]] = nn5[k]
                        translation10[src[i + k]] = nn10[k]

                # Precision@1/5/10 over the covered test entries
                accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])

                mean = 0
                for i in src:
                    for k in translation5[i]:
                        if k in src2trg[i]:
                            mean += 1
                            break
                mean /= len(src)
                accuracy5 = mean

                mean = 0
                for i in src:
                    for k in translation10[i]:
                        if k in src2trg[i]:
                            mean += 1
                            break
                mean /= len(src)
                accuracy10 = mean

                print('Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
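# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called by the scripts in this
# file): a compact NumPy-only version of the CSLS retrieval performed in the
# evaluation loop above. The real code additionally batches the computation
# and moves the target-side neighbourhood term to the GPU with CuPy; this
# sketch assumes everything fits in memory and that `xw` and `zw` are
# length-normalised, so dot products are cosine similarities.
import numpy as np

def csls_translate(xw, zw, src_indices, k=10):
    """Return, for every index in src_indices, the CSLS-best target index."""
    sims = xw[src_indices].dot(zw.T)  # cos(x, z)
    # r_src(x): mean similarity of each source word to its k nearest targets
    r_src = -np.partition(-sims, k - 1, axis=1)[:, :k].mean(axis=1)
    # r_trg(z): mean similarity of each target word to its k nearest sources
    r_trg = -np.partition(-zw.dot(xw.T), k - 1, axis=1)[:, :k].mean(axis=1)
    # CSLS(x, z) = 2*cos(x, z) - r_src(x) - r_trg(z)
    return (2 * sims - r_src[:, None] - r_trg[None, :]).argmax(axis=1)

# Top-5/top-10 accuracy, as computed above, follows the same scoring but takes
# an argsort of the CSLS matrix instead of a single argmax.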
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--maxiter', type=int, default=10, help='max number of iterations')
    parser.add_argument('--corekbest', type=int, default=2, help='nearest-neighbour rank up to which a pair is considered a match')
    parser.add_argument('--decayrate', type=float, default=1.01, help='decay rate for boosting (passed to find_matches)')
    parser.add_argument('--init_vocab', type=int, default=10000, help='initial vocabulary cutoff for the boosting/self-learning loop')
    parser.add_argument('--dictname', default='dict.tmp', help='filename stem for the per-iteration dictionary dump')

    recommended_type = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--identical', default=True, help='recommended if you have no seed dictionary but can rely on identical words')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--vocabulary', help='restrict source vocab')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')

    # Parse once to pick up --supervised, then re-parse with the derived defaults
    args = parser.parse_args()
    parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'])
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    os.makedirs(OUTPUTDIR, exist_ok=True)

    # Read input embeddings
    vocabulary = None
    if args.vocabulary is not None:
        vocabulary = set()
        with open(args.vocabulary, encoding=args.encoding, errors='surrogateescape') as file:
            for l in file:
                vocabulary.add(l.split()[0])
        print(f'vocab size:\t{len(vocabulary)}')
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=args.vocabulary_cutoff, vocabulary=vocabulary)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=args.vocabulary_cutoff)
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.supervised:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            try:
                src, trg = line.split()[:2]
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            try:
                src, trg = line.split()
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        # If one of the translation options is in the vocabulary, then the entry is not an OOV
        oov -= vocab
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)

    matches = collections.Counter()
    decided = collections.Counter()
    cum_weights = collections.Counter(matches)
    score = collections.Counter()
    # Seed matches from the supervised dictionary (if any) ...
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    # ... and from identical word forms shared by both vocabularies
    identical = set(src_words).intersection(set(trg_words))
    for word in list(identical):
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1

    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple('MatchStats', ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    stats = None
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        # x, z = np.array(x0), np.array(z0)
        embeddings.noise(x)
        embeddings.noise(z)
        if args.unconstrained:
            # Unconstrained mapping: weighted least squares
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices], np.sqrt(weights) * z[trg_indices], rcond=None)[0]
            # w = np.linalg.lstsq(x[src_indices], z[trg_indices], rcond=None)[0]
            x.dot(w, out=xw)
            zw = z[:]
        else:
            # Orthogonal mapping: SVD of the weighted cross-covariance (Procrustes)
            u, s, vt = xp.linalg.svd((weights * z[trg_indices]).T.dot(x[src_indices]))
            # u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw = z[:]

        w_dot = np.sum(weights * z[trg_indices] * xw[src_indices]) / weights.sum()
        mean_dot = np.sum(z[trg_indices] * xw[src_indices]) / len(src_indices)
        delta_w = np.linalg.norm(w - wprev)
        stats = Stats(w_dot=w_dot, mean_dot=mean_dot, delta_w=delta_w, current_vocab=current_vocab, len_match=len(src_indices))
        # Grow the working vocabulary when the weighted objective stops improving
        if it > 1 and stats.w_dot < pstats.w_dot:
            current_vocab = min(int(current_vocab * 1.1), args.vocabulary_cutoff)

        # Temperature annealing from 1 down to 1e-2 over the course of training
        T = 1 * np.exp((it - 1) * np.log(1e-2) / (args.maxiter))
        # T = 1
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw, zw, cum_weights, score, ul=current_vocab, T=T, kbest=args.corekbest, csls=args.csls_neighborhood, decay=args.decayrate)

        # Decay previous decisions and mix in this iteration's match scores
        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)
        for m in score:
            if m in score:
                eta = 1 / it
            else:
                eta = max(0.5, 1 / it)
            decided[m] = decided[m] * (1 - eta) + score[m] * eta

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
            similarity = np.mean([np.max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])
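        # Dump the induced dictionary for this iteration: `decided` appears to hold a
        # smoothed confidence for every candidate pair (decayed above and refreshed
        # with the scores returned by find_matches), written one pair per line.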
        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t
        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
            print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
            print(f'\t- #match/#decided: {len(src_indices)}/{len(decided)}', file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
            log.flush()

        if it >= args.maxiter:
            break

        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    # write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, x.dot(w), srcfile)
        embeddings.write(trg_words, z, trgfile)
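# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called by the code above): the
# weighted orthogonal Procrustes update used inside the training loop. Given
# paired rows of x and z and per-pair weights, the orthogonal W minimising the
# weighted squared error is W = V U^T, where U S V^T is the SVD of the
# weighted cross-covariance (w * Z)^T X, matching the SVD step in the loop.
import numpy as np

def orthogonal_procrustes(x_pairs, z_pairs, weights=None):
    """Return the orthogonal matrix W such that x_pairs.dot(W) approximates z_pairs."""
    if weights is None:
        weights = np.ones(x_pairs.shape[0], dtype=x_pairs.dtype)
    weights = np.asarray(weights).reshape(-1, 1)
    u, s, vt = np.linalg.svd((weights * z_pairs).T.dot(x_pairs))
    return vt.T.dot(u.T)

# Hypothetical usage: map random "source" vectors onto rotated copies of
# themselves and recover the rotation.
# x = np.random.randn(100, 50)
# q, _ = np.linalg.qr(np.random.randn(50, 50))
# w = orthogonal_procrustes(x, x.dot(q))
# assert np.allclose(w, q, atol=1e-6)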