Example No. 1
"""
@site: http://muyeby.github.io
@software: PyCharm
@file: GetTrainVec.py
@time: 17-12-18 10:11 AM
"""

import numpy as np
import embeddings
import sys
from sklearn import preprocessing

if __name__ == "__main__":

    source_file = open(sys.argv[1], encoding='utf-8', errors='surrogateescape')
    target_file = open(sys.argv[2], encoding='utf-8', errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)

    src_word2ind = {word: i for i, word in enumerate(en_words)}
    trg_word2ind = {word: i for i, word in enumerate(de_words)}

    src_indices = []
    trg_indices = []
    src_words = []
    trg_words = []

    f = open(sys.argv[3], encoding='utf-8', errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        try:
            src_words.append(src)
            trg_words.append(trg)
            src_indices.append(src_word2ind[src])
            trg_indices.append(trg_word2ind[trg])
        except KeyError:
            pass  # skip pairs with out-of-vocabulary words
Example No. 2
def evaluate(src_emb_fname,
             tgt_emb_fname,
             dict_fname,
             max_voc=0,
             retrieval_method="csls",
             csls_k=10,
             batch_size=2500):

    print('Loading train data...')

    srcfile = open(src_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')
    tgtfile = open(tgt_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=max_voc, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}

    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=max_voc, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}

    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Loading test dictionary
    f = open(dict_fname, encoding='utf-8', errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if max_voc:  # lowercase dictionary entries when a vocabulary cut-off is in use
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    ### get translations
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]

            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        # batch_size=1000
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))
        print('Computing Z Neighbourhood')

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) -
                csls_alpha * nbrhood_x[src[i:j]]) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    #### write the translations (1 pair per line format)
    #with open(trans_tgt_fname, 'w', encoding='utf-8', errors='surrogateescape') as trans_tgt_file:
    #    for w in trans_words:
    #        trans=''
    #        if w in src_word2ind:
    #            trans=tgt_words[translation[src_word2ind[w]]]
    #        trans_tgt_file.write('{}\t{}\n'.format(w,trans))

    # evaluation metrics
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
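
A small driver for the evaluate() function above might look like the following sketch. The argparse wiring, argument names, and defaults here are assumptions added for illustration; they are not part of the original listing.

# Hypothetical driver for evaluate(); argument names and defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Evaluate bilingual lexicon induction with NN or CSLS retrieval')
    parser.add_argument('src_emb', help='source-language embeddings (text format)')
    parser.add_argument('tgt_emb', help='target-language embeddings (text format)')
    parser.add_argument('test_dict', help='test dictionary, one "src trg" pair per line')
    parser.add_argument('--retrieval', default='csls', choices=['nn', 'csls'])
    parser.add_argument('--csls_k', type=int, default=10)
    parser.add_argument('--max_voc', type=int, default=0)
    cli_args = parser.parse_args()

    evaluate(cli_args.src_emb, cli_args.tgt_emb, cli_args.test_dict,
             max_voc=cli_args.max_voc,
             retrieval_method=cli_args.retrieval,
             csls_k=cli_args.csls_k)
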
Example No. 3
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--retrieval', default='nn', choices=['nn', 'topk', 'invnn', 'invsoftmax', 'csls'], help='the retrieval method (nn: standard nearest neighbor; topk: k nearest neighbors; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)')
    parser.add_argument('--inv_temperature', default=1, type=float, help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument('--inv_sample', default=None, type=int, help='use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)')
    parser.add_argument('-k', '--neighborhood', default=10, type=int, help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--dot', action='store_true', help='use the dot product in the similarity computations instead of the cosine')
    # parser.add_argument('--mean', action='store_true', help='Mean center the target.')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # KNN neighborhood for MRR.
    knn = args.neighborhood

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # if args.mean:
    #     print(args.mean)
    #     print("Mean Center....")
    #     embeddings.mean_center(x)
    #     embeddings.mean_center(z)


    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    count = 0
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Find translations
    translation = collections.defaultdict(list)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]].append(nn[k])
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k-i, l]
                    sim = sims[k-i, l]
                    if rank < best_rank[l] or (rank == best_rank[l] and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        translation[src[l]] = [k]  # keep only the current best candidate
    # Added by Ashwinkumar Ganesan.
    elif args.retrieval == 'topk':  # Keep the k nearest neighbors (for top-k / MRR-style evaluation).
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argsort(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]] = nn[k][-knn: ]

    # Continue as usual.
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
        sample = xp.arange(x.shape[0]) if args.inv_sample is None else xp.random.randint(0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature*z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature*x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]].append(nn[k])
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T), k=args.neighborhood, inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2*x[src[i:j]].dot(z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]].append(nn[k])

    # Compute accuracy
    accuracy = np.mean([1 if len(set(translation[i]) & set(src2trg[i])) > 0 else 0 for i in src])
    print('KNN: {0:} Coverage:{1:7.2%}  Accuracy:{2:7.2%}'.format(knn, coverage, np.mean(accuracy)))
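
The csls branch in the example above calls a topk_mean helper that is not shown in this listing. A plain-NumPy sketch of what such a helper typically computes (the mean of the k largest similarities in each row) follows; the function name and keyword argument are taken from the call site, while the body is an assumption.

import numpy as np

def topk_mean(m, k, inplace=False):
    # Sketch: mean of the k largest values in each row of m.
    # 'inplace' is accepted for call-site compatibility; this sketch always works on a copy.
    top_k = -np.partition(-m, k - 1, axis=1)[:, :k]
    return top_k.mean(axis=1)
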
Example No. 4
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word similarity/relatedness')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', nargs='?', help='the target language embeddings')
    parser.add_argument('-i', '--input', default=[sys.stdin.fileno()], nargs='+', help='the input datasets (defaults to stdin)')
    parser.add_argument('-l', '--lowercase', action='store_true', help='lowercase the words in the test files')
    parser.add_argument('--backoff', default=None, type=float, help='use a backoff similarity score for OOV entries')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--sim', nargs='*', help='the names of the datasets to include in the similarity results')
    parser.add_argument('--rel', nargs='*', help='the names of the datasets to include in the relatedness results')
    parser.add_argument('--all', nargs='*', help='the names of the datasets to include in the total results')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Parse test files
    word_pairs = []
    golds = []
    for filename in args.input:
        f = open(filename, encoding=args.encoding, errors='surrogateescape')
        word_pairs.append([])
        golds.append([])
        for line in f:
            if args.lowercase:
                line = line.lower()
            src, trg, score = line.split('\t')
            word_pairs[-1].append((src, trg))
            # shape like [(src, trg), (src, trg), ... ]
            golds[-1].append(float(score))
            # shape like [score, score, score ...]

    # Build vocabularies
    src_vocab = {pair[0] for pairs in word_pairs for pair in pairs}
    trg_vocab = {pair[1] for pairs in word_pairs for pair in pairs}

    # Read embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.src_embeddings if args.trg_embeddings is None else args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile, vocabulary=src_vocab, dtype=dtype)
    trg_words, trg_matrix = embeddings.read(trgfile, vocabulary=trg_vocab, dtype=dtype)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    embeddings.length_normalize(src_matrix)
    embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Compute system scores and correlations
    results = []
    for i in range(len(golds)):
        system = []
        gold = []
        oov = 0
        for gold_score, (src, trg) in zip(golds[i], word_pairs[i]):
            try:
                cos = np.dot(src_matrix[src_word2ind[src]], trg_matrix[trg_word2ind[trg]])
                system.append(cos)
                gold.append(gold_score)
            except KeyError:
                if args.backoff is None:
                    oov += 1
                else:
                    system.append(args.backoff)
                    gold.append(gold_score)
        name = os.path.splitext(os.path.basename(args.input[i]))[0]
        #os.path.splitext : split the path name path into a pair (root, ext)
        #for example : 'en-de.txt' --> 'en-de', '.txt'
        #os.path.basename : return the base name of path name path
        #for example : './data/en-de.txt' --> 'en-de.txt'
        coverage = len(system) / (len(system) + oov)
        pearson = scipy.stats.pearsonr(gold, system)[0]
        # Calculate a Pearson correlation coefficient and the p-value for testing non-correlation.
        spearman = scipy.stats.spearmanr(gold, system)[0]
        # Calculate a Spearman rank-order correlation coefficient and the p-value to test for non-correlation.
        results.append((name, coverage, pearson, spearman))
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | {3}'.format(coverage, pearson, spearman, name))

    # Compute and print total (averaged) results
    # if there are multiple input test files
    if len(results) > 1:
        print('-'*80)
        if args.sim is not None:
            sim = list(zip(*[res for res in results if res[0] in args.sim]))
            print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | sim.'.format(np.mean(sim[1]), np.mean(sim[2]), np.mean(sim[3])))
        if args.rel is not None:
            rel = list(zip(*[res for res in results if res[0] in args.rel]))
            print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | rel.'.format(np.mean(rel[1]), np.mean(rel[2]), np.mean(rel[3])))
        if args.all is not None:
            results = [res for res in results if res[0] in args.all]
        results = list(zip(*results))
        # zip(*result) : unzip
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | all'.format(np.mean(results[1]), np.mean(results[2]), np.mean(results[3])))
Example No. 5
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word analogy')
    parser.add_argument('embeddings', help='the word embeddings')
    parser.add_argument(
        '-t',
        '--threshold',
        type=int,
        default=0,
        help=
        'reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)'
    )
    parser.add_argument('-i',
                        '--input',
                        default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l',
                        '--lowercase',
                        action='store_true',
                        help='lowercase the words in the test file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    f = open(args.embeddings, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)

    # Build word to index map
    word2ind = {word: i for i, word in enumerate(words)}

    # Length normalize embeddings
    matrix = embeddings.length_normalize(matrix)

    # Parse test file
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    categories = []
    src1 = []
    trg1 = []
    src2 = []
    trg2 = []
    for line in f:
        if line.startswith(': '):
            name = line[2:-1]
            is_syntactic = name.startswith('gram')
            categories.append({
                'name': name,
                'is_syntactic': is_syntactic,
                'total': 0,
                'oov': 0
            })
        else:
            try:
                ind = [
                    word2ind[word.lower() if args.lowercase else word]
                    for word in line.split()
                ]
                src1.append(ind[0])
                trg1.append(ind[1])
                src2.append(ind[2])
                trg2.append(ind[3])
                categories[-1]['total'] += 1
            except KeyError:
                categories[-1]['oov'] += 1
    total = len(src1)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (matrix[src2[i:j]] - matrix[src1[i:j]] +
                        matrix[trg1[i:j]]).dot(matrix.T)
        similarities[range(j - i), src1[i:j]] = -1
        similarities[range(j - i), trg1[i:j]] = -1
        similarities[range(j - i), src2[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        correct = np.sum(nn[ind:ind +
                            category['total']] == trg2[ind:ind +
                                                       category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                category['total'] / (category['total'] + category['oov']),
                correct / category['total'], category['name']))
    if args.verbose:
        print('-' * 80)
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.
          format((semantic['total'] + syntactic['total']) /
                 (semantic['total'] + syntactic['total'] + semantic['oov'] +
                  syntactic['oov']),
                 (semantic['correct'] + syntactic['correct']) /
                 (semantic['total'] + syntactic['total']),
                 semantic['correct'] / semantic['total'],
                 syntactic['correct'] / syntactic['total']))
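
The test-file parser above expects Google-analogy-style input: category headers introduced by ': ' (syntactic categories start with 'gram') and four whitespace-separated words per question. A short illustrative fragment of that format:

: capital-common-countries
Athens Greece Berlin Germany
: gram1-adjective-to-adverb
calm calmly quick quickly
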
Example No. 6
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path',
                        default=None,
                        type=str,
                        help='directory to save the model')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain',
        '--dictionary_train',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=1000,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    ## Logging
    #method_name = os.path.join('logs','geomm')
    #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    #if not os.path.exists(directory):
    #    os.makedirs(directory)
    #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train))
    #log_file_name = log_file_name + '.log'
    #class Logger(object):
    #    def __init__(self):
    #        self.terminal = sys.stdout
    #        self.log = open(os.path.join(directory,log_file_name), "a")

    #    def write(self, message):
    #        self.terminal.write(message)
    #        self.log.write(message)

    #    def flush(self):
    #        #this flush method is needed for python 3 compatibility.
    #        #this handles the flush command by doing nothing.
    #        #you might want to specify some extra behavior here.
    #        pass
    #sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    trg_words, z = embeddings.read(trgfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    noov = 0
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg))  #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(
            noov))
    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))
    A = np.zeros((x_count, z_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]] = I
            I += 1
            uniq_src.append(src_indices[i])
    J = 0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]] = J
            J += 1
            uniq_trg.append(trg_indices[j])

    for i in range(len(src_indices)):
        A[map_dict_src[src_indices[i]], map_dict_trg[trg_indices[i]]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()

    cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(x.shape[1], x.shape[1]),
        Stiefel(z.shape[1], x.shape[1]),
        PositiveDefinite(x.shape[1])
    ])
    #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path, exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path), U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path), U2)
        np.savetxt('{}/B.csv'.format(args.model_path), B)

    # Step 2: Transformation
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'src.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(src_words, xw_n, outfile)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'trg.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(trg_words, zw_n, outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    X = xw[src_indices]
    Z = zw[trg_indices]

    # Loading test dictionary
    f = open(args.dictionary_test,
             encoding=args.encoding,
             errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    ### compute nearest neighbours of x in z
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    ### compute nearest neighbours of z in x (GPU version)
    nbrhood_z = np.zeros(zw.shape[0])
    with cp.cuda.Device(0):
        nbrhood_z2 = cp.zeros(zw.shape[0])
        batch_num = 1
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z2[i:j] = (cp.mean(
                similarities[:, :args.csls_neighbourhood], axis=1))
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)

    #### compute nearest neighbours of z in x (CPU version)
    #nbrhood_z=np.zeros(zw.shape[0])
    #for i in range(0, len(zw.shape[0]), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(zw.shape[0]))
    #    similarities = zw[i:j].dot(xw.T)
    #    similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
    #    nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1)

    #### find translation
    #for i in range(0, len(src), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(src))
    #    similarities = xw[src[i:j]].dot(zw.T)
    #    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
    #    nn = similarities.argmax(axis=1).tolist()
    #    similarities = np.argsort((similarities),axis=1)

    #    nn5 = (similarities[:,-5:])
    #    nn10 = (similarities[:,-10:])
    #    for k in range(j-i):
    #        translation[src[i+k]] = nn[k]
    #        translation5[src[i+k]] = nn5[k]
    #        translation10[src[i+k]] = nn10[k]

    #if args.geomm_embeddings_path is not None:
    #    delim=','
    #    os.makedirs(args.geomm_embeddings_path,exist_ok=True)

    #    translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
    #    with open(translations_fname,'w',encoding=args.encoding) as translations_file:
    #        for src_id in src:
    #            src_word = src_words[src_id]
    #            all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
    #            trgout_words = [ trg_words[j] for j in translation10[src_id] ]
    #            ss = list(nn10[src_id,:])
    #
    #            p1 = ':'.join(all_trg_words)
    #            p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
    #            translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) )

    ### find translation  (and write to file if output requested)
    delim = ','
    translations_file = None
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)
        translations_fname = os.path.join(args.geomm_embeddings_path,
                                          'translations.csv')
        translations_file = open(translations_fname,
                                 'w',
                                 encoding=args.encoding)

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]

            if args.geomm_embeddings_path is not None:
                src_id = src[i + k]
                src_word = src_words[src_id]
                all_trg_words = [
                    trg_words[trg_id] for trg_id in src2trg[src_id]
                ]
                trgout_words = [trg_words[j] for j in translation10[src_id]]
                #ss = list(nn10[src_id,:])

                p1 = ':'.join(all_trg_words)
                p2 = ':'.join(trgout_words)
                #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
                translations_file.write('{s}{delim}{p1}{delim}{p2}\n'.format(
                    s=src_word, p1=p1, p2=p2, delim=delim))

    if args.geomm_embeddings_path is not None:
        translations_file.close()

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
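
Examples No. 2 and No. 6 both score candidates with CSLS inside the batched translation loops. For reference, the quantity those loops compute can be restated as the compact NumPy sketch below; the variable names mirror the listings, but this helper is an illustration and not part of the original code.

import numpy as np

def csls_scores(xw_batch, zw, nbrhood_x_batch, nbrhood_z):
    # CSLS(x, z) = 2*cos(x, z) - r_x(x) - r_z(z), where r_x and r_z are the mean
    # cosine similarities to the k nearest neighbours in the other language
    # (the nbrhood_* vectors precomputed in the listings above).
    sim = xw_batch.dot(zw.T)
    return 2 * sim - nbrhood_x_batch[:, None] - nbrhood_z[None, :]
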
Example No. 7
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--orthographic_ext',
        default=0,
        type=float,
        help=
        'augment embeddings with character n-gram counts; provide inverse scale constant as argument'
    )
    self_learning_group.add_argument(
        '--orthographic_ext_n',
        default=1,
        type=int,
        help='n for character n-grams in orthographic_ext option')
    self_learning_group.add_argument(
        '--orthographic_sim',
        default=0,
        type=float,
        help=
        'use edit distance when calculating similarity; provide inverse scale constant as argument'
    )
    self_learning_group.add_argument(
        '--orthographic_sim_k',
        default=1,
        type=int,
        help=
        'k to use for symmetric delete heuristic for limiting edit distance calculations'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words = None
    x = None
    trg_words = None
    z = None
    if args.orthographic_ext:
        (src_words, x), (trg_words,
                         z) = embeddings.orthoread(srcfile, trgfile,
                                                   args.orthographic_ext,
                                                   args.orthographic_ext_n)
    else:
        src_words, x = embeddings.read(srcfile)
        trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    ortho_sim = None
    ortho_sim_scale = args.orthographic_sim
    if args.orthographic_sim:
        s = ntpath.basename(args.src_input)[0:2]
        t = ntpath.basename(args.trg_input)[0:2]
        k = args.orthographic_sim_k
        ortho_sim = ortho.loadOrCreateSimilarityMatrix(s, t, k)

    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    if args.orthographic_sim:
                        sim += (ortho_sim[i:i + MAX_DIM_X, j:j + MAX_DIM_Z].
                                toarray()) / ortho_sim_scale

                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in (
                            'backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) +
                             np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
Example No. 8
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
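        # With the orthogonal constraint this is the Procrustes solution
        # W = V U^T obtained from the SVD of Z^T X over the current dictionary;
        # the unconstrained branch is the ordinary least-squares solution
        # W = (X^T X)^{-1} X^T Z.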
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
        else:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
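            # Induce a new dictionary by nearest-neighbour search, processed in
            # MAX_DIM_X x MAX_DIM_Z blocks so the full similarity matrix never
            # has to be materialized; 'forward' keeps the best target for each
            # source word and 'backward' the best source for each target word.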
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(z[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
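            # The objective is the mean similarity of the induced dictionary
            # (a cosine similarity when 'unit' normalization is used); training
            # stops once the improvement falls below args.threshold.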
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(z.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
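
The orthogonal update at the heart of the script above is the classic Procrustes solution. The following toy snippet is a minimal sketch using only NumPy (all names here are illustrative and not part of the script); it shows that the same SVD recipe exactly recovers a known rotation:

import numpy as np

# Find the orthogonal W minimizing ||X.dot(W) - Z||_F (orthogonal Procrustes).
rng = np.random.RandomState(0)
X = rng.randn(100, 50)                  # toy "source" vectors
R, _ = np.linalg.qr(rng.randn(50, 50))  # a random orthogonal matrix
Z = X.dot(R)                            # toy "target" vectors: rotated source
u, s, vt = np.linalg.svd(Z.T.dot(X))    # same decomposition as in the script
W = vt.T.dot(u.T)
print(np.allclose(X.dot(W), Z))         # True: W recovers the rotation R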
Exemplo n.º 9
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Evaluate embeddings of two languages in a shared space in word translation induction'
    )
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        help='the target language embeddings')
    parser.add_argument('-d',
                        '--dictionary',
                        default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Compute accuracy
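    # Nearest-neighbour retrieval: the embeddings were length-normalized above,
    # so the dot product ranks target words by cosine similarity and a source
    # entry counts as correct if the top-ranked target is one of its gold
    # translations.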
    correct = 0
    for src, trg in src2trg.items():
        similarities = np.dot(trg_matrix, src_matrix[src])
        closest = np.argmax(similarities)
        if closest in trg:
            correct += 1
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(
        coverage, correct / len(src2trg)))
Exemplo n.º 10
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('sense_input', help='the input sense mapping matrix')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('tsns_output',
                        default='tsns.pkl',
                        help='the output target senses pickle file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--future',
                                  action='store_true',
                                  help='run with experimental settings')
    recommended_type.add_argument('--toy',
                                  action='store_true',
                                  help='run with experimental settings on a toy dataset')
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log',
        default='map.log',
        help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')

    future_group = parser.add_argument_group('experimental arguments',
                                             'Experimental arguments')
    future_group.add_argument('--skip_top',
                              type=int,
                              default=0,
                              help='Top k words to skip, presumably function words')
    future_group.add_argument(
        '--start_src',
        action='store_true',
        help='Algorithm starts by tuning sense embeddings based on source')
    future_group.add_argument('--trim_senses',
                              action='store_true',
                              help='Trim sense table to working vocab')
    future_group.add_argument(
        '--lamb',
        type=float,
        default=0.5,
        help='Weight hyperparameter for sense alignment objectives')
    future_group.add_argument('--reglamb',
                              type=float,
                              default=1.,
                              help='Lasso regularization hyperparameter')
    future_group.add_argument(
        '--ccreglamb',
        type=float,
        default=0.1,
        help='Sense embedding regularization hyperparameter')
    future_group.add_argument('--inv_delta',
                              type=float,
                              default=0.0001,
                              help='Delta_I added for inverting sense matrix')
    future_group.add_argument('--lasso_iters',
                              type=int,
                              default=10,
                              help='Number of iterations for LASSO/NMF')
    future_group.add_argument('--iterations',
                              type=int,
                              default=-1,
                              help='Number of overall model iterations')
    future_group.add_argument('--trg_batch',
                              type=int,
                              default=5000,
                              help='Batch size for target steps')
    future_group.add_argument(
        '--trg_knn',
        action='store_true',
        help='Perform target sense mapping by k-nearest neighbors')
    future_group.add_argument(
        '--trg_sns_csls',
        type=int,
        default=10,
        help='K-nearest neighbors for CSLS target sense search')
    future_group.add_argument(
        '--senses_per_trg',
        type=int,
        default=1,
        help='K-max target sense mapping (default = 1 = off)')
    future_group.add_argument(
        '--gd',
        action='store_true',
        help='Apply gradient descent for assignment and synset embeddings')
    future_group.add_argument('--gd_lr',
                              type=float,
                              default=1e-2,
                              help='Learning rate for SGD (default=0.01)')
    future_group.add_argument('--gd_wd',
                              action='store_true',
                              help='Weight decay in SGD')
    future_group.add_argument(
        '--gd_wd_hl',
        type=int,
        default=100,
        help='Weight decay half-life in SGD, default=100')
    future_group.add_argument(
        '--gd_clip',
        type=float,
        default=5.,
        help='Per-coordinate gradient clipping (default=5)')
    future_group.add_argument(
        '--gd_map_steps',
        type=int,
        default=1,
        help='Consecutive steps for each target-sense mapping update phase')
    future_group.add_argument(
        '--gd_emb_steps',
        type=int,
        default=1,
        help='Consecutive steps for each sense embedding update phase')
    future_group.add_argument(
        '--base_prox_lambda',
        type=float,
        default=0.99,
        help='Lambda for proximal gradient in lasso step')
    future_group.add_argument(
        '--prox_decay',
        action='store_true',
        help='Multiply proximal lambda by itself each iteration')
    future_group.add_argument(
        '--sense_limit',
        type=float,
        default=1.1,
        help=
        'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)'
    )
    future_group.add_argument(
        '--gold_pairs',
        help='Gold data for evaluation, if exists (not for tuning)')
    future_group.add_argument(
        '--gold_threshold',
        type=float,
        default=0.0,
        help='Threshold for gold mapping (0 is fine if sparse)')

    future_group.add_argument('--debug', action='store_true')

    args = parser.parse_args()

    # pre-setting groups
    if args.toy:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=50,
                            trim_senses=True,
                            inv_delta=1.,
                            reglamb=0.2,
                            lasso_iters=100,
                            gd_wd=True,
                            log='map-toy.log')
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=2000,
                            trim_senses=True,
                            gd_wd=True)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=20000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'  # many operations not supported by cupy
    elif args.precision == 'fp32':  # default
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # Read input source sense mapping
    print('reading sense mapping')
    src_senses = pickle.load(open(args.sense_input, 'rb'))
    if src_senses.shape[0] != x.shape[0]:
        src_senses = csr_matrix(src_senses.transpose()
                                )  # using non-cuda scipy because of 'inv' impl
    #src_senses = get_sparse_module(src_senses)
    print(
        f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros'
    )

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # removed word to index map (only relevant in supervised learning or with validation)

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # removed building the seed dictionary

    # removed validation step

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory

    # Initialize the projection matrices W(s) = W(t) = I.
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    xw[:] = x
    zw[:] = z

    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0] - args.skip_top, args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0] - args.skip_top, args.vocabulary_cutoff)
    emb_dim = x.shape[1]

    cutoff_end = min(src_size + args.skip_top, x.shape[0])

    if args.trim_senses:
        # reshape sense assignment
        src_senses = src_senses[args.skip_top:cutoff_end]

        # new columns for words with no senses in original input
        ### TODO might also need this if not trimming (probably kinda far away)
        newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\
                   if src_senses.getrow(i).getnnz() == 0]
        #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file:
        #    dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0]
        #    pickle.dump(np.array(dummy_col_idcs), dummy_cols_file)

        # trim senses no longer used, add new ones
        colsums = src_senses.sum(axis=0).tolist()[0]
        kept_senses = [i for i, j in enumerate(colsums) if j > 0]
        #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file:
        #    pickle.dump(np.array(kept_senses), kept_save_file)
        src_senses = hstack([src_senses[:, kept_senses]] + newcols)
        print(
            f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros'
        )
    sense_size = src_senses.shape[1]

    if args.gold_pairs is not None:
        with open(args.gold_pairs, 'rb') as gold_pairs_f:
            gold_pairs = pickle.load(gold_pairs_f)
            gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \
                          if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]]
        gold_trgs = sorted(set([x[0] for x in gold_pairs]))
        gold_senses = sorted(set([x[1] for x in gold_pairs]))
        gold_domain_size = len(gold_trgs) * len(gold_senses)
        print(
            f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses'
        )

    # Initialize the concept embeddings from the source embeddings
    ### TODO maybe try gradient descent instead?
    ### TODO (pre-)create non-singular alignment matrix
    cc = xp.empty((sense_size, emb_dim), dtype=dtype)  # \tilde{E}
    t01 = time.time()
    print('starting psinv calc')
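    # The concept embeddings are initialized with a regularized least-squares
    # fit of the source embeddings onto their sense assignments, presumably
    # cc = (S^T S + inv_delta * I)^{-1} S^T X given the identity check below.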
    src_sns_psinv = psinv(src_senses, dtype, args.inv_delta)
    xecc = x[args.skip_top:cutoff_end].T.dot(
        get_sparse_module(src_senses).toarray()).T  # sense_size * emb_dim
    cc[:] = src_sns_psinv.dot(xecc)
    print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds',
          file=sys.stderr)
    if args.verbose:
        # report precision of the pseudo-inverse operation, checked by inverting
        pseudo_id = src_senses.transpose().dot(src_senses).dot(
            src_sns_psinv.get())
        real_id = sparse_id(sense_size)
        rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size)
        print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}')

    ### TODO initialize trg_senses using seed dictionary instead?
    trg_sns_size = trg_size if args.trim_senses else z.shape[0]
    trg_senses = csr_matrix(
        (trg_sns_size,
         sense_size))  # using non-cuda scipy because of 'inv' impl
    zecc = xp.empty_like(xecc)  # sense_size * emb_dim
    #tg_grad = xp.empty((trg_sns_size, sense_size))

    if args.gd:
        # everything can be done on gpu
        src_senses = get_sparse_module(src_senses, dtype=dtype)
        trg_senses = get_sparse_module(trg_senses, dtype=dtype)
        if args.sense_limit > 0.0:
            trg_sense_limit = int(args.sense_limit * src_senses.getnnz())
            if args.verbose:
                print(
                    f'limiting target side to {trg_sense_limit} sense mappings'
                )
        else:
            trg_sense_limit = -1

    ### TODO return memory assignment for similarities?

    # Training loop
    if args.gd:
        prox_lambda = args.base_prox_lambda
    else:
        lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\
                            positive=True, warm_start=True)  # TODO more parametrization

    if args.log is not None:
        if args.gd:
            print(f'gradient descent lr: {args.gd_lr}', file=log)
            print(f'base proximal lambda: {args.base_prox_lambda}', file=log)
        else:
            print(f'lasso regularization: {args.reglamb}', file=log)
            print(f'lasso iterations: {args.lasso_iters}', file=log)
            print(f'inversion epsilon: {args.inv_delta}', file=log)
        if args.gold_pairs is not None:
            print(f'gold mappings: {len(gold_pairs)}', file=log)
        print(
            f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings',
            file=log)
        log.flush()

    best_objective = objective = 1000000000.
    correct_mappings = -1
    regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb
    it = 1
    last_improvement = 0
    t = time.time()
    map_gd_lr = args.gd_lr
    emb_gd_lr = args.gd_lr
    end = False
    print('starting training')

    if args.start_src:
        print('starting with converging synset embeddings')
        it_range = range(
            args.iterations
        )  ### TODO possibly add arg, but there's early stopping
        if not args.verbose:
            it_range = tqdm(it_range)
        prev_obj = float('inf')
        for pre_it in it_range:
            if args.gd_wd:
                emb_gd_lr = args.gd_lr * pow(0.5, floor(
                    pre_it / args.gd_wd_hl))

            # Synset embedding
            cc_grad = src_senses.T.dot(
                xw[args.skip_top:cutoff_end] -
                src_senses.dot(cc)) - args.ccreglamb * cc
            cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
            cc += emb_gd_lr * cc_grad

            # Source projection
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)

            pre_objective = ((xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
            pre_objective = float(pre_objective)

            if args.verbose and pre_it > 0 and pre_it % 10 == 0:
                print(
                    f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}'
                )

            if pre_objective > prev_obj:
                print(
                    f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}'
                )
                # revert
                cc -= emb_gd_lr * cc_grad
                break

            prev_obj = pre_objective

    while True:
        if it % 50 == 0:
            print(
                f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}'
            )

        # Reset the last-improvement marker if there has been no improvement in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            last_improvement = it

        if args.iterations > 0 and it > args.iterations:
            end = True

        ### update target assignments (6) - lasso-esque regression
        time6 = time.time()
        # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1))

        if args.trg_knn:
            # for csls-based neighborhoods
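            # CSLS: score(z, c) = 2*sim(z, c) - r(c) - r(z), where r(.) is the
            # mean similarity to the k nearest neighbours; r(z) is constant
            # within each target row, so subtracting knn_sense / 2 below leaves
            # the argmax ranking unchanged.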
            knn_sense = xp.full(sense_size, -100)
            for i in range(0, sense_size, args.trg_batch):
                batch_end = min(i + args.trg_batch, sense_size)
                sim_sense_trg = cc[i:batch_end].dot(
                    zw[args.skip_top:cutoff_end].T)
                knn_sense[i:batch_end] = topk_mean(sim_sense_trg,
                                                   k=args.trg_sns_csls,
                                                   inplace=True)

            # calculate new target mappings
            trg_senses = lil_matrix(trg_senses.shape)
            for i in range(0, trg_size, args.trg_batch):
                sns_batch_end = min(i + args.trg_batch, trg_size)
                z_i = i + args.skip_top
                z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0])

                sims = zw[z_i:z_batch_end].dot(cc.T)
                sims -= knn_sense / 2  # equivalent to the real CSLS scores for NN
                best_idcs = sims.argmax(1).tolist()
                trg_senses[(list(range(i, sns_batch_end)),
                            best_idcs)] = sims.max(1).tolist()

                # second-to-lth-best
                for l in range(args.senses_per_trg - 1):
                    sims[(list(range(sims.shape[0])), best_idcs)] = 0.
                    best_idcs = sims.argmax(1).tolist()
                    trg_senses[(list(range(i, sns_batch_end)),
                                best_idcs)] = sims.max(1).tolist()

            trg_senses = get_sparse_module(trg_senses.tocsr())

        elif args.gd:
            ### TODO add args.skip_top calculations
            if args.gd_wd:
                true_it = (it - 1) * args.gd_map_steps
                map_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'mapping learning rate: {map_gd_lr}')

            for k in range(args.gd_map_steps):
                # st <- st + eta * (ew - st.dot(es)).dot(es.T)
                # allow up to sense_limit updates, clip gradient

                batch_grads = []
                for i in range(0, trg_size, args.trg_batch):
                    batch_end = min(i + args.trg_batch, trg_size)
                    tg_grad_b = (zw[i:batch_end] -
                                 trg_senses[i:batch_end].dot(cc)).dot(cc.T)

                    # proximal gradient
                    tg_grad_b += prox_lambda
                    tg_grad_b.clip(None, 0.0, out=tg_grad_b)
                    batch_grads.append(batch_sparse(tg_grad_b))

                tg_grad = get_sparse_module(vstack(batch_grads))
                del tg_grad_b

                if args.prox_decay:
                    prox_lambda *= args.base_prox_lambda

                ### TODO consider weight decay here as well (args.gd_wd)
                trg_senses -= map_gd_lr * tg_grad

                # allow up to sense_limit nonzeros
                if trg_sense_limit > 0:
                    trg_senses = trim_sparse(trg_senses,
                                             trg_sense_limit,
                                             clip=None)

            ### TODO consider finishing up with lasso (maybe only in final iteration)

        else:
            ### TODO add args.skip_top calculations
            # parallel LASSO (no cuda impl)
            cccpu = cc.get().T  # emb_dim * sense_size
            lasso_model.fit(cccpu, zw[:trg_size].get().T)
            ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it)
            trg_senses = lasso_model.sparse_coef_

        if args.verbose:
            print(
                f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros',
                file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        # Write target sense mapping
        with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl',
                  mode='wb') as tsnsfile:
            pickle.dump(trg_senses.get(), tsnsfile)

        ### update synset embeddings (10)
        time10 = time.time()
        if args.gd and args.gd_emb_steps > 0:
            ### TODO probably handle sizes and/or threshold sparse matrix
            if args.gd_wd:
                true_it = (it - 1) * args.gd_emb_steps
                emb_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'embedding learning rate: {emb_gd_lr}')

            ### replace block for no-source-tuning mode
            all_senses = trg_senses if args.start_src else get_sparse_module(
                vstack((src_senses.get(), trg_senses.get()), format='csr'),
                dtype=dtype)
            aw = (zw[args.skip_top:cutoff_end] if args.start_src else
                  xp.concatenate((xw[args.skip_top:cutoff_end],
                                  zw[args.skip_top:cutoff_end])))

            for i in range(args.gd_emb_steps):
                cc_grad = all_senses.T.dot(
                    aw - all_senses.dot(cc)) - args.ccreglamb * cc
                cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
                cc += emb_gd_lr * cc_grad

        else:
            ### TODO add args.skip_top calculations
            all_senses = get_sparse_module(
                vstack((src_senses, trg_senses), format='csr'))
            xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\
                        .dot(all_senses.toarray()).T  # sense_size * emb_dim
            all_sns_psinv = psinv(
                all_senses.get(), dtype, args.inv_delta
            )  ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same]
            cc[:] = all_sns_psinv.dot(xzecc)

        if args.verbose:
            print(f'synset embedding update: {time.time()-time10:.2f}',
                  file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        ### update projections (3,5)
        # write to zw and xw
        if args.orthogonal or not end:

            ### remove block for no-source-tuning mode
            # source side - mappings don't change so xecc is constant
            #if not args.start_src:  # need to do this anyway whenever cc updates
            time3 = time.time()
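            # Orthogonal Procrustes update: Wx minimizes ||X Wx - S_src cc||_F
            # over orthogonal matrices, computed from the SVD of cc^T (S_src^T X).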
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)
            if args.verbose:
                print(f'source projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

            # target side - compute sense mapping first
            time3 = time.time()
            zecc.fill(0.)
            for i in range(0, trg_size, args.trg_batch):
                end_idx = min(i + args.trg_batch, trg_size)
                zecc += z[i:end_idx].T.dot(
                    get_sparse_module(trg_senses[i:end_idx]).toarray()).T
            u, s, vt = xp.linalg.svd(cc.T.dot(zecc))
            wz = vt.T.dot(u.T).astype(dtype)
            z.dot(wz, out=zw)
            if args.verbose:
                print(f'target projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

        ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc.

        # Objective function evaluation
        time_obj = time.time()
        trg_senses_l1 = float(trg_senses.sum())
        src_obj = (float(
            xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
        trg_obj = (float(
            xp.linalg.norm(
                zw[args.skip_top:cutoff_end] -
                get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2
        objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1  # TODO consider thresholding reg part
        if args.verbose:
            print(f'objective calculation: {time.time()-time_obj:.2f}',
                  file=sys.stderr)

        if objective - best_objective <= -args.threshold:
            last_improvement = it
            best_objective = objective

        # WordNet transduction evaluation (can't tune on this)
        if args.gold_pairs is not None:
            np_trg_senses = trg_senses.get()
            trg_corr = [
                p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold
            ]
            correct_mappings = len(trg_corr)
            domain_trgs = np_trg_senses[gold_trgs][:, gold_senses]
        else:
            correct_mappings = -1

        # Logging
        duration = time.time() - t
        if args.verbose:
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('objective: {0:.3f}'.format(objective), file=sys.stderr)
            print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1),
                  file=sys.stderr)
            if (args.gold_pairs is not None and len(gold_pairs) > 0
                    and domain_trgs.getnnz() > 0):
                print(
                    f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision',
                    file=sys.stderr)
            print(file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            print(
                f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}',
                file=log)
            log.flush()

        if end:
            break

        t = time.time()
        it += 1

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        embeddings.write(src_words, xw, srcfile)
    with open(args.trg_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        embeddings.write(trg_words, zw, trgfile)

    # Write target sense mapping
    with open(args.tsns_output, mode='wb') as tsnsfile:
        pickle.dump(trg_senses.get(), tsnsfile)
Exemplo n.º 11
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Evaluate embeddings of two languages in a shared space in word translation induction'
    )
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        help='the target language embeddings')
    parser.add_argument('-d',
                        '--dictionary',
                        default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument('-o',
                        '--output-dictionary',
                        default=sys.stdin.fileno(),
                        help='path to the output dictionary')
    parser.add_argument(
        '--retrieval',
        default='nn',
        choices=['nn', 'invnn', 'invsoftmax', 'csls'],
        help=
        'the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)'
    )
    parser.add_argument(
        '--inv_temperature',
        default=1,
        type=float,
        help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument(
        '--inv_sample',
        default=None,
        type=int,
        help=
        'use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)'
    )
    parser.add_argument(
        '-k',
        '--neighborhood',
        default=10,
        type=int,
        help='the neighborhood size (only compatible with csls)')
    parser.add_argument(
        '--dot',
        action='store_true',
        help=
        'use the dot product in the similarity computations instead of the cosine'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src = set()
    for line in f:
        src_word = line.split()[0]
        src_ind = src_word2ind.get(src_word, None)
        if src_ind is not None:
            src.add(src_ind)
    src = list(src)

    # Find translations
    translation = collections.defaultdict(int)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
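        # Inverted nearest neighbor: each source word is assigned the target
        # word for which that source word has the best rank among all sources
        # (ties broken by similarity), which mitigates hubness.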
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k - i, l]
                    sim = sims[k - i, l]
                    if rank < best_rank[l] or (rank == best_rank[l]
                                               and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        translation[src[l]] = k
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
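        # Inverted softmax: exp(beta * sim) is normalized over (a sample of) the
        # source vocabulary rather than the target vocabulary, penalizing target
        # words that are near many sources (hubs).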
        sample = xp.arange(
            x.shape[0]) if args.inv_sample is None else xp.random.randint(
                0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature *
                                z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature * x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
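        # CSLS: 2*sim(x, z) minus the mean similarity of z to its k nearest
        # source-side neighbours; the analogous source-side term is constant per
        # source word and can be dropped without changing the argmax.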
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T),
                                         k=args.neighborhood,
                                         inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2 * x[src[i:j]].dot(
                z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    with open(args.output_dictionary,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as fout:
        for src_ind in src:
            trg_ind = translation[src_ind]
            src_word = src_ind2word[src_ind]
            trg_word = trg_ind2word[trg_ind]
            fout.write('\t'.join([src_word, trg_word]) + '\n')
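
The CSLS branch above relies on a topk_mean helper that is not shown in this snippet. As a rough sketch of what it presumably computes (the name topk_mean_dense and this dense implementation are assumptions, not the script's actual helper):

import numpy as np

def topk_mean_dense(sim, k=10):
    # Mean of the k largest entries in each row of a dense similarity matrix.
    return np.partition(sim, -k, axis=1)[:, -k:].mean(axis=1)

sim = np.random.RandomState(0).randn(5, 100)
print(topk_mean_dense(sim, k=10).shape)  # (5,)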
Exemplo n.º 12
def filter_embeddings(in_embfname, filter_func, max_voc=0):
    # Reconstructed from the truncated snippet: keep only words accepted by filter_func
    with open(in_embfname, encoding='utf-8', errors='surrogateescape') as in_embfile:
        words, vecs = embeddings.read(in_embfile, max_voc=max_voc)
    kept = [i for i, word in enumerate(words) if filter_func(word)]
    return [words[i] for i in kept], vecs[kept]
Exemplo n.º 13
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument(
        '--batch_size',
        default=10000,
        type=int,
        help=
        'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory'
    )
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')
    parser.add_argument('--maxiter',
                        type=int,
                        default=10,
                        help='max number of iterations')
    parser.add_argument('--corekbest',
                        type=int,
                        default=2,
                        help='nn ranking to be considered as a match')
    parser.add_argument('--decayrate',
                        type=float,
                        default=1.01,
                        help='for boosting')
    parser.add_argument('--init_vocab',
                        type=int,
                        default=10000,
                        help='for boosting')
    parser.add_argument('--dictname',
                        default='dict.tmp',
                        help='output the dictionary')

    recommended_type = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type.add_argument(
        '--supervised',
        metavar='DICTIONARY',
        help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--identical',
        default=True,
        help=
        'recommended if you have no seed dictionary but can rely on identical words'
    )

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument(
        '-d',
        '--init_dictionary',
        default=sys.stdin.fileno(),
        metavar='DICTIONARY',
        help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical',
                           action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--vocabulary', help='restrict source vocab')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--csls',
                                     type=int,
                                     nargs='?',
                                     default=0,
                                     const=10,
                                     metavar='NEIGHBORHOOD_SIZE',
                                     dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        metavar='DICTIONARY',
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    parser.set_defaults(init_dictionary=args.supervised,
                        normalize=['unit', 'center', 'unit'])
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    os.makedirs(OUTPUTDIR, exist_ok=True)

    # Read input embeddings
    vocabulary = None
    if args.vocabulary is not None:
        vocabulary = set()
        with open(args.vocabulary,
                  encoding=args.encoding,
                  errors='surrogateescape') as file:
            for l in file:
                vocabulary.add(l.split()[0])
        print(f'vocab size:\t{len(vocabulary)}')

    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff,
                                       vocabulary=vocabulary)
        trg_words, z = embeddings.read(trgfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff)
        embeddings.normalize(x, args.normalize)
        embeddings.normalize(z, args.normalize)
    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build the seed dictionary
    src_indices = []
    trg_indices = []

    if args.supervised:
        f = open(args.init_dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            try:
                src, trg = line.split()[:2]
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            try:
                src, trg = line.split()
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)

    matches = collections.Counter()
    decided = collections.Counter()
    cum_weights = collections.Counter(matches)
    score = collections.Counter()
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    identical = set(src_words).intersection(set(trg_words))
    for word in list(identical):
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1

    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple(
        'MatchStats',
        ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    stats = None
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        # x, z = np.array(x0), np.array(z0)

        embeddings.noise(x)
        embeddings.noise(z)

        if args.unconstrained:
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices],
                                np.sqrt(weights) * z[trg_indices],
                                rcond=None)[0]
            # w = np.linalg.lstsq(x[src_indices], z[trg_indices], rcond=None)[0]
            x.dot(w, out=xw)
            zw = z[:]
        else:
            u, s, vt = xp.linalg.svd(
                (weights * z[trg_indices]).T.dot(x[src_indices]))
            # u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw = z[:]
            w_dot = np.sum(
                weights * z[trg_indices] * xw[src_indices]) / weights.sum()
            mean_dot = np.sum(
                z[trg_indices] * xw[src_indices]) / len(src_indices)
            delta_w = np.linalg.norm(w - wprev)
            stats = Stats(w_dot=w_dot,
                          mean_dot=mean_dot,
                          delta_w=delta_w,
                          current_vocab=current_vocab,
                          len_match=len(src_indices))

        if it > 1 and stats.w_dot < pstats.w_dot:
            current_vocab = min(int(current_vocab * 1.1),
                                args.vocabulary_cutoff)

        T = 1 * np.exp((it - 1) * np.log(1e-2) / (args.maxiter))
        # T = 1
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw,
                                          zw,
                                          cum_weights,
                                          score,
                                          ul=current_vocab,
                                          T=T,
                                          kbest=args.corekbest,
                                          csls=args.csls_neighborhood,
                                          decay=args.decayrate)

        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)

        for m in score:
            # running average of the match scores with step size 1/it
            eta = 1 / it
            decided[m] = decided[m] * (1 - eta) + score[m] * eta

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([
                1 if nn[i] in validation[src[i]] else 0
                for i in range(len(src))
            ])
            similarity = np.mean([
                np.max([simval[i, j].tolist() for j in validation[src[i]]])
                for i in range(len(src))
            ])

        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t

        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('\t- Objective:        {0:9.4f}%'.format(100 * objective),
                  file=sys.stderr)
            print(
                f'\t- #match/#decided:             {len(src_indices)}/{len(decided)}',
                file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                               similarity),
                      file=sys.stderr)
                print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy),
                      file=sys.stderr)
                print('\t- Val. coverage:    {0:9.4f}%'.format(
                    100 * validation_coverage),
                      file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                100 * similarity, 100 * accuracy, 100 *
                validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val,
                                                      duration),
                  file=log)
            log.flush()

        if it >= args.maxiter:
            break
        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    # write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)

    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, x.dot(w), srcfile)
        embeddings.write(trg_words, z, trgfile)
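flatten_match and find_matches are project-specific helpers that are not part of this snippet. A minimal sketch of flatten_match, based only on how it is called above (both arguments are counters keyed by (source index, target index) pairs, numpy is assumed to be imported as np as elsewhere in this file, and the returned weights must broadcast against rows of x[src_indices]):

def flatten_match(matches, weight_counter):
    # Turn a Counter of (src_index, trg_index) -> weight into parallel index
    # arrays plus a column vector of weights, shape (n, 1), so that
    # np.sqrt(weights) * x[src_indices] scales whole rows.
    src_indices, trg_indices, weights = [], [], []
    for (si, ti) in matches:
        src_indices.append(si)
        trg_indices.append(ti)
        weights.append(weight_counter[(si, ti)])
    weights = np.asarray(weights, dtype='float32').reshape(-1, 1)
    return np.asarray(src_indices), np.asarray(trg_indices), weights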
Exemplo n.º 14
def add_oov_embeddings(train_dict_fname,
                       test_dict_fname,
                       src_emb_fname,
                       tgt_emb_fname,
                       out_src_emb_fname,
                       out_tgt_emb_fname,
                       src_model_path,
                       tgt_model_path,
                       fast_text_binary_path,
                       max_voc=200000,
                       emb_format='txt'):
    """
    Adds the embeddings for OOV words in the training and test dictionaries to the embedding file. 
    This is done by computing the embeddings using FastText. So, this method applies to FastText 
    embeddings only. Note that the output embedding file will contain only the OOV words plus 
    the first max_voc words in the original embedding file.
    
    train_dict_fname: 
    test_dict_fname: 
    src_emb_fname: embedding file for source language 
    tgt_emb_fname: embedding file for target language
    out_src_emb_fname: output embedding file for source language 
    out_tgt_emb_fname: output embedding file for target language    
    src_model_path: fasttext model for source language 
    tgt_model_path: fasttext model for targetqa language 
    fast_text_binary_path: path to fasttext binary
    max_voc: number of vocab items to process from the embedding file
    emb_format: format of embedding files. Currently supported: 'txt' - standard fast text format
    """

    ## read dictionaries
    train_dict = read_bilingual_dict(train_dict_fname)
    test_dict = read_bilingual_dict(test_dict_fname)

    # read embeddings
    src_vcb_words = None
    src_emb = None
    tgt_vcb_words = None
    tgt_emb = None

    with open(src_emb_fname, 'r', encoding='utf-8' ) as src_emb_file, \
         open(tgt_emb_fname, 'r', encoding='utf-8' ) as tgt_emb_file:
        src_vcb_words, src_emb = embeddings.read(src_emb_file, max_voc)
        tgt_vcb_words, tgt_emb = embeddings.read(tgt_emb_file, max_voc)

    ## find OOVs
    src_oov_words = set()
    src_oov_words.update(train_dict.keys())
    src_oov_words.update(test_dict.keys())
    src_oov_words.difference_update(src_vcb_words)
    print('Number of src OOV words: {}'.format(len(src_oov_words)))

    tgt_oov_words = set()

    tgt_oov_words.update(it.chain(train_dict.values()))
    tgt_oov_words.update(it.chain(test_dict.values()))
    tgt_oov_words.difference_update(tgt_vcb_words)
    print('Number of tgt OOV words: {}'.format(len(tgt_oov_words)))

    ## compute embeddings for OOV
    ##### cat queries.txt | ./fasttext print-word-vectors model.bin
    src_oov_final_words, src_oov_emb = compute_fasttext_embeddings(
        src_oov_words, src_model_path, fast_text_binary_path)
    tgt_oov_final_words, tgt_oov_emb = compute_fasttext_embeddings(
        tgt_oov_words, tgt_model_path, fast_text_binary_path)

    if (len(src_oov_words) != len(src_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV source words'
            .format(
                len(src_oov_words) - len(src_oov_final_words),
                len(src_oov_words)))

    if (len(tgt_oov_words) != len(tgt_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV target words'
            .format(
                len(tgt_oov_words) - len(tgt_oov_final_words),
                len(tgt_oov_words)))

    ## write new embeddings files to disk
    ## put the OOV words first followed by words in the original embeddings file
    with open(out_src_emb_fname, 'w', encoding='utf-8' ) as out_src_emb_file, \
         open(out_tgt_emb_fname, 'w', encoding='utf-8' ) as out_tgt_emb_file:
        embeddings.write(src_oov_final_words + src_vcb_words,
                         np.concatenate([src_oov_emb, src_emb]),
                         out_src_emb_file)
        embeddings.write(tgt_oov_final_words + tgt_vcb_words,
                         np.concatenate([tgt_oov_emb, tgt_emb]),
                         out_tgt_emb_file)
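compute_fasttext_embeddings is referenced above but not included in this snippet. Following the command shown in the comment (cat queries.txt | ./fasttext print-word-vectors model.bin), a subprocess-based sketch could look like this; the name and argument order match the call sites above, but the body itself is an assumption:

import subprocess
import numpy as np

def compute_fasttext_embeddings(words, model_path, fast_text_binary_path):
    # Pipe the query words to `fasttext print-word-vectors model.bin` and parse
    # one "word v1 v2 ... vd" line per query word that fastText echoes back.
    query = '\n'.join(words) + '\n'
    proc = subprocess.run([fast_text_binary_path, 'print-word-vectors', model_path],
                          input=query.encode('utf-8'),
                          stdout=subprocess.PIPE,
                          check=True)
    out_words, vectors = [], []
    for line in proc.stdout.decode('utf-8').splitlines():
        fields = line.rstrip().split(' ')
        if len(fields) < 2:
            continue
        out_words.append(fields[0])
        vectors.append([float(v) for v in fields[1:]])
    return out_words, np.asarray(vectors, dtype='float32')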
Exemplo n.º 15
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('-e', '--epochs', type=int, default=500, help='number of iterations')
    parser.add_argument('--pickle', action='store_true', help='load embedding from pickled object')
    parser.add_argument('--trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    if args.pickle:
        with open(args.src_input, 'rb') as fin:
            src_words, x = pickle.load(fin)
        with open(args.trg_input, 'rb') as fin:
            trg_words, z = pickle.load(fin)
    else:
        srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
        trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
        src_words, x = embeddings.read(srcfile, dtype=dtype)
        trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning

    epoch = 0
    while True:
        epoch += 1
        if epoch == args.epochs:
            keep_prob = 1.0
        if epoch == args.epochs + 50:
            end = True

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    # srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    # trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    dic = {
        'W_source': asnumpy(w),
        'W_target': np.identity(300, dtype=np.float32),
        'source_lang': 'en',
        'target_lang': args.trg_input.split('/')[1][:2] if args.trg_input.endswith('.bin') else args.trg_input.split('/')[5:7],
        'model': 'ubi',
        'note': 'vecmap',
    }
    with open(args.src_output, 'wb') as fout:
        pickle.dump(dic, fout)
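The orthogonal update used in the training loop above (u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])); w = vt.T.dot(u.T)) is the closed-form solution of the orthogonal Procrustes problem. A small self-contained check of that formula on synthetic data, independent of the rest of the script:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 50))
Q, _ = np.linalg.qr(rng.normal(size=(50, 50)))   # a random orthogonal "ground-truth" map
Z = X.dot(Q) + 0.01 * rng.normal(size=X.shape)   # paired embeddings plus a little noise

u, s, vt = np.linalg.svd(Z.T.dot(X))             # same decomposition as in the loop
W = vt.T.dot(u.T)                                # orthogonal W minimizing ||X.dot(W) - Z||
print(np.allclose(W.dot(W.T), np.eye(50)))       # True: W is orthogonal
print(np.linalg.norm(X.dot(W) - Z) / np.linalg.norm(Z))  # ~0.01: only the noise remains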
Exemplo n.º 16
def translate(src_emb_fname,
              tgt_emb_fname,
              trans_tgt_fname=None,
              trans_src_fname=None,
              retrieval_method="csls",
              csls_k=10,
              batch_size=2500):

    print('Loading train data...')

    srcfile = open(src_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')
    tgtfile = open(tgt_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}

    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=0, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}

    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()
    #if trans_src_fname is not None:
    if isinstance(trans_src_fname, str):
        with open(trans_src_fname,
                  'r',
                  encoding='utf-8',
                  errors='surrogateescape') as trans_src_file:
            for line in trans_src_file:
                try:
                    #w=line.strip().lower()
                    w = line.strip()
                    all_words.append(w)
                    w_ind = src_word2ind[w]
                    trans_words.append(w)
                    trans_idx.append(w_ind)
                except KeyError:
                    oov.add(w)
    elif isinstance(trans_src_fname, list):
        for w in trans_src_fname:
            try:
                all_words.append(w)
                w_ind = src_word2ind[w]
                trans_words.append(w)
                trans_idx.append(w_ind)
            except KeyError:
                oov.add(w)

    else:
        all_words = src_words
        trans_words = src_words
        trans_idx = list(range(len(src_words)))
        oov = set()

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))
    src = trans_idx

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]

            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        # batch_size=1000
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))
        print('Computing Z Neighbourhood')

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) -
                csls_alpha * nbrhood_x[src[i:j]]) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    # get translations
    trans_pairs = []
    for w in trans_words:
        trans = ''
        if w in src_word2ind:
            trans = tgt_words[translation[src_word2ind[w]]]
        if len(trans) > 0 or trans_tgt_fname is not None:
            ### include blank lines only in the case of writing output to file
            trans_pairs.append((w, trans))

    ### write the translations (1 pair per line format)
    if trans_tgt_fname is not None:
        with open(trans_tgt_fname,
                  'w',
                  encoding='utf-8',
                  errors='surrogateescape') as trans_tgt_file:
            for w, trans in trans_pairs:
                trans_tgt_file.write('{}\t{}\n'.format(w, trans))
    else:
        return dict(trans_pairs)
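A usage sketch for translate (the embedding file names below are placeholders): passing a list of source words returns a dict from each in-vocabulary source word to its retrieved target word, while passing trans_tgt_fname writes the pairs to disk instead. The 'csls' branch as written depends on CuPy, so the CPU-only 'nn' retrieval is used here.

pairs = translate('mapped.src.vec', 'mapped.tgt.vec',
                  trans_src_fname=['house', 'water', 'dog'],
                  retrieval_method='nn')
for src_word, trg_word in pairs.items():
    print(src_word, trg_word)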
Exemplo n.º 17
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word analogy')
    parser.add_argument('--src_embeddings', help='the word embeddings for source (left side)')
    parser.add_argument('--trg_embeddings', help='the word embeddings for target (right side)')
    parser.add_argument('-t', '--threshold', type=int, default=0, help='reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the test file (defaults to stdin)')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose output (give category specific results)')
    parser.add_argument('-l1', '--src_lowercase', action='store_true', help='lowercase the words in the test file')
    parser.add_argument('-l2', '--trg_lowercase', action='store_true', help='lowercase the words in the test file')    
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    f = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()
    f = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    trg_words, trg_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()
    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}
    
    # Length normalize embeddings
    embeddings.length_normalize(src_matrix)
    embeddings.length_normalize(trg_matrix)
    
    # Parse test file
    # c-a+b ~ d
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    categories = []
    a = [] #src lang
    b = [] #src lang
    c = [] #trg lang
    d = [] #trg lang
    linecounter = 0
    for line in f:
        if line.startswith(': '):
            name = line[2:-1]
            is_syntactic = name.startswith('gram')
            categories.append({'name': name, 'is_syntactic': is_syntactic, 'total': 0, 'oov': 0})
        else:
            try:
                words = line.split()
                #ind = [word2ind[word.lower() if args.lowercase else word] for word in line.split()]

                w0 = src_word2ind[words[0].lower() if args.src_lowercase else words[0]]
                w1 = src_word2ind[words[1].lower() if args.src_lowercase else words[1]]
                w2 = trg_word2ind[words[2].lower() if args.trg_lowercase else words[2]]
                w3 = trg_word2ind[words[3].lower() if args.trg_lowercase else words[3]]

                a.append(w0)
                b.append(w1)
                c.append(w2)
                d.append(w3)
                
                categories[-1]['total'] += 1
            except KeyError:
                categories[-1]['oov'] += 1
    total = len(a)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (trg_matrix[c[i:j]] - src_matrix[a[i:j]] + src_matrix[b[i:j]]).dot(trg_matrix.T)
        similarities[range(j-i), a[i:j]] = -1
        similarities[range(j-i), b[i:j]] = -1
        similarities[range(j-i), c[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    with open('crosslingual_predict.txt', 'w') as outfile:
        for i in range(len(nn)):
            outfile.write(src_ind2word[a[i]]+' '+src_ind2word[b[i]]+' '+trg_ind2word[c[i]]+' '+trg_ind2word[d[i]]+' | '+trg_ind2word[nn[i]]+'\n')
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        correct = np.sum(nn[ind:ind+category['total']] == d[ind:ind+category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                category['total'] / (category['total'] + category['oov']),
                correct / category['total'],
                category['name']))
    if args.verbose:
        print('-'*80)
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        (semantic['total'] + syntactic['total']) / (semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        (semantic['correct'] + syntactic['correct']) / (semantic['total'] + syntactic['total']),
        semantic['correct'] / semantic['total'],
        syntactic['correct'] / syntactic['total']))
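The parser above expects the cross-lingual analogy file in the usual word2vec questions-words layout: a ': category' header line (categories whose names start with 'gram' are counted as syntactic), followed by four space-separated words per line, the first two in the source language and the last two in the target language, evaluated as c - a + b ≈ d. A hypothetical English-German fragment:

: capital-common-countries
Athens Greece Berlin Deutschland
Paris France Rom Italien
: gram3-comparative
good better gut besser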