Example #1
import argparse
import sys

import embeddings  # the helper module that ships with these scripts


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument('actions',
                        choices=['unit', 'center', 'unitdim', 'centeremb'],
                        nargs='*',
                        default=[],
                        help='the actions to perform in order')
    parser.add_argument(
        '-i',
        '--input',
        default=sys.stdin.fileno(),
        help='the input word embedding file (defaults to stdin)')
    parser.add_argument(
        '-o',
        '--output',
        default=sys.stdout.fileno(),
        help='the output word embedding file (defaults to stdout)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)
    f.close()

    # Perform normalization actions
    embeddings.normalize(matrix, args.actions)

    # Write normalized embeddings
    f = open(args.output,
             mode='w',
             encoding=args.encoding,
             errors='surrogateescape')
    embeddings.write(words, matrix, f)
    f.close()
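The snippet ends at main() with no entry point; a minimal sketch of one (the script name in the usage comment is hypothetical):

if __name__ == '__main__':
    main()  # e.g.: python normalize_embeddings.py unit center -i in.vec -o out.vec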
Example #2
import argparse
import collections
import re
import sys
import time

import numpy as np

import embeddings
import lat_var
from cupy_utils import *  # assumed to provide supports_cupy, get_cupy, asnumpy

# topk_mean() and dropout() are helpers defined elsewhere in this script
# (see the sketch after this example).


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--test-dict', help='the test dictionary file')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    # Note: changed the argument so that dictionary is supplied with -d instead
    recommended_type.add_argument('--acl2017_seed', action='store_true', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')
    # still requires specifying a seed dictionary or another init
    recommended_type.add_argument('--ruder_emnlp2018', action='store_true', help='reproduce EMNLP 2018 latent-variable model of Ruder et al.')
    recommended_type.add_argument('--ruder_emnlp2018_backward', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) with matching in backward direction')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018_unsupervised', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with the unsupervised initialization of Artetxe et al. (ACL 2018)')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with the mapping of Artetxe et al. (ACL 2018)')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')

    lat_var_group = parser.add_argument_group('arguments for latent-variable model', 'Arguments for latent-variable model')
    lat_var_group.add_argument('--lat-var', action='store_true', help='use the latent-variable model')
    lat_var_group.add_argument('--n-similar', type=int, default=3, help='# of most similar trg indices used for sparsifying in latent-variable model')
    lat_var_group.add_argument('--n-repeats', default=1, type=int, help='repeats embeddings to get 2:2, 3:3, etc. alignment in latent-variable model')
    lat_var_group.add_argument('--asym', default='1:1', help='specify 1:2 or 2:1 for asymmetric matching in latent-variable model')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)

    # reduce stochastic interval
    # note: just backward direction works surprisingly well
    if args.ruder_emnlp2018_artetxe_acl2018_unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018_artetxe_acl2018:
        parser.set_defaults(normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.ruder_emnlp2018_backward:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='backward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)

    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.init_dictionary, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    if args.verbose:
        print("Info: arguments\n\t" + "\n\t".join(
            ["{}: {}".format(a, v) for a, v in vars(args).items()]),
              file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=200000)
    trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=200000)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        if args.verbose:
            print('Using unsupervised initialization...')
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        if args.verbose:
            print('Using numerals as seeds...')
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        if args.verbose:
            print('Using identical strings as seeds...')
            print(f'Found {len(identical)} identical strings.')
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        print(f'Using a dictionary of size {len(src_indices)}.')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            sims = xp.zeros((src_size, trg_size), dtype=dtype)  # xp rather than np so this also works under --cuda
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    simfwd[:j-i] = dropout(simfwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        # we get a dimension mismatch here as lat_var may produce fewer seeds
                        simfwd[:j-i].argmax(axis=1, out=trg_indices_forward[i:j])
                    sims[i:j] = simfwd[:j-i]
                if args.lat_var:
                    # TODO check if we can save memory by not storing a large sims matrix
                    src_indices_forward, trg_indices_forward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    simbwd[:j-i] = dropout(simbwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        simbwd[:j-i].argmax(axis=1, out=src_indices_backward[i:j])
                    sims[i:j] = simbwd[:j-i]  # note: assumes the src and trg cutoffs match, since sims is (src_size, trg_size)
                if args.lat_var:
                    # swap the order of the indices
                    trg_indices_backward, src_indices_backward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
            # elif args.direction == 'intersection':
            #     fwd_pairs = zip(src_indices_forward, trg_indices_forward)
            #     bwd_pairs = zip(src_indices_backward, trg_indices_backward)
            #     src_indices, trg_indices = zip(*set(fwd_pairs).intersection(bwd_pairs))
            #     src_indices, trg_indices = xp.array(src_indices), xp.array(trg_indices)

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

        if args.test_dict:
            # save the embeddings for evaluation
            with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile,\
                    open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
                embeddings.write(src_words, xw, srcfile)
                embeddings.write(trg_words, zw, trgfile)

            # EVALUATING TRANSLATION
            print('Evaluating translation...')

            # we skip length normalization here

            # Read dictionary and compute coverage
            f = open(args.test_dict, encoding=args.encoding,
                     errors='surrogateescape')
            src2trg = collections.defaultdict(set)
            oov = set()
            vocab = set()
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src2trg[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
            src = list(src2trg.keys())
            oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
            coverage = len(src2trg) / (len(src2trg) + len(oov))

            BATCH_SIZE = 500

            # Find translations
            translation = collections.defaultdict(int)

            # we just use nearest neighbour for retrieval
            for i in range(0, len(src), BATCH_SIZE):
                j = min(i + BATCH_SIZE, len(src))
                similarities = xw[src[i:j]].dot(zw.T)
                nn = similarities.argmax(axis=1).tolist()
                for k in range(j - i):
                    translation[src[i + k]] = nn[k]

            # Compute accuracy
            accuracy = np.mean(
                [1 if translation[i] in src2trg[i] else 0 for i in src])
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy))

    # Write mapped embeddings
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, zw, trgfile)
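Example #2 calls two helpers, topk_mean() and dropout(), that are defined elsewhere in the script. A minimal sketch of implementations consistent with the call sites above (shown with plain NumPy; the exact signatures are assumptions, and a CuPy-aware variant would be needed under --cuda):

def topk_mean(m, k, inplace=False):
    # Mean similarity of each row's k nearest neighbours (used for CSLS).
    n = m.shape[0]
    ans = np.zeros(n, dtype=m.dtype)
    if k <= 0:
        return ans
    if not inplace:
        m = np.array(m)
    ind0 = np.arange(n)
    ind1 = np.empty(n, dtype=np.intp)
    minimum = m.min()
    for _ in range(k):
        m.argmax(axis=1, out=ind1)   # best remaining column per row
        ans += m[ind0, ind1]
        m[ind0, ind1] = minimum      # mask it so the next pass finds the runner-up
    return ans / k


def dropout(m, p):
    # Randomly zero out entries with probability p (stochastic dictionary induction).
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask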
Example #3
import argparse
import collections
import pickle
import re
import sys
import time

import numpy as np

import embeddings
from cupy_utils import *  # assumed to provide supports_cupy, get_cupy, asnumpy

# topk_mean() and dropout() are helpers defined elsewhere in this script
# (see the sketch after Example #2).


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('dict_output', default='dictionary.pkl', help='the output dictionary pickle file')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--future', action='store_true', help='experimental settings (enables the experimental alignment arguments below)')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    
    future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments')
    future_group.add_argument('--max_align', type=int, default=1, help='Number of top-ranked elements to align to each word (defaults to 1=base)')
    future_group.add_argument('--align_weight', choices=['unit', 'rr', 'softmax'], default='rr', help='Weights assigned to ranked elements in maximization phase (unit - no weighting; rr - reciprocal rank; softmax - NOT IMPLEMENTED YET)')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', default='map.log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10, max_align=2, align_weight='rr')
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map (only relevant in supervised learning or with validation)
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    print(f'mapped {len(src_words)} source words')
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    print(f'mapped {len(trg_words)} target words')

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
        print(f'initialized unsupervised dictionary')
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized numeral dictionary')
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized identical dictionary')
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        f.close()
        print('initialized seed dictionary')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))
        print(f'loaded validation dictionary with {validation_coverage:.3f} coverage')

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((min(src_size, args.batch_size), trg_size), dtype=dtype)
    simbwd = xp.empty((min(trg_size, args.batch_size), src_size), dtype=dtype)
    #argsimsf = xp.empty((min(src_size, args.batch_size), args.max_align), dtype=int)
    #argsimsb = xp.empty((min(trg_size, args.batch_size), args.max_align), dtype=int)
    argsimsf = xp.empty((min(src_size, args.batch_size), 1), dtype=int)
    argsimsb = xp.empty((min(trg_size, args.batch_size), 1), dtype=int)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.array(list(range(src_size)) * args.max_align)
    trg_indices_forward = xp.zeros(src_size * args.max_align, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size * args.max_align, dtype=int)
    trg_indices_backward = xp.array(list(range(trg_size)) * args.max_align)
    xr = xp.zeros(((src_size+trg_size) * args.max_align, x.shape[1]), dtype=dtype)  # assumes "both" param
    zr = xp.zeros(((src_size+trg_size) * args.max_align, z.shape[1]), dtype=dtype)  # assumes "both" param
    all_coefs = xp.zeros(((src_size+trg_size) * args.max_align, 1), dtype=dtype)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    print('starting training')
    while True:
        if it % 50 == 0:
            print(f'starting iteration {it}')

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping (only affecting vectors that have dictionary mappings)
        if args.orthogonal or not end:  # orthogonal mapping
            if it == 1:
                # only initialized alignment available
                u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            else:
                if args.align_weight == 'softmax':
                    ### TODO individualized softmax coefficients ###
                    raise NotImplementedError('Softmax weights not supported yet')
                else:
                    ### TODO I'm assuming here that the alignment method is 'both', so everything's double
                    ### TODO all_coefs can be computed outside the iteration loop
                    # format: src_size_0, ..., src_size_k-1, trg_size_0, ..., trg_size_k-1
                    ncopies = args.max_align
                    cutoffs = list(range(src_size*ncopies)[::src_size]) \
                              + list(range(src_size*ncopies,(src_size+trg_size)*ncopies)[::trg_size])
                    if args.align_weight == 'rr':
                        coefs = [1. / (k+1) for k in range(ncopies)] * 2            
                    else:  # 'unit'
                        coefs = [1.] * (ncopies * 2)
                    for cf, co_s, co_e in zip(coefs, cutoffs, cutoffs[1:] + [len(all_coefs)]):
                        all_coefs[co_s:co_e] = cf
                    zr = z[trg_indices] * all_coefs
                    xr = x[src_indices] * all_coefs
                    u, s, vt = xp.linalg.svd(zr.T.dot(xr))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping (default for end, acl2018)

            # remove lower-rank transformations
            midpoint = src_size * args.max_align
            src_indices = xp.concatenate((src_indices[:src_size], src_indices[midpoint:midpoint+trg_size]))
            trg_indices = xp.concatenate((trg_indices[:src_size], trg_indices[midpoint:midpoint+trg_size]))
            
            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z
            
            ### TODO entry point for adding more matrix operations ###

            # STEP 1: Whitening
            ### TODO figure out how weighted k-best affects this (and onwards) ###
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction (default: OFF (0))
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary (default direction - union)
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:  # default acl2018: 10
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)  # get next batch to operate on
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    
                    # softmaxing
                    #argsimsf[:] = dropout(-simfwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsf = dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        # mask each row's current best so the next pass yields its (k+1)-th best
                        simfwd[xp.arange(j-i), argsimsf] = -200
                        trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf
                        #trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf[:,k]
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)  # get next batch to operate on
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    
                    # softmaxing
                    #argsimsb[:] = dropout(-simbwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsb = dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        # mask each row's current best so the next pass yields its (k+1)-th best
                        simbwd[xp.arange(j-i), argsimsb] = -200
                        trg_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb
                        #src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb[:,k]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':  # default
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation (default - off)
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
    
    # Write dictionary
    dictfile = open(args.dict_output, mode='wb')
    dictalign = list(zip(src_indices, trg_indices))
    pickle.dump(dictalign, dictfile)
    dictfile.close()
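
A minimal sketch (src_words and trg_words as read earlier in the script; the path is hypothetical) of loading the induced dictionary back and inspecting a few aligned pairs:

import pickle

with open('dict.pkl', mode='rb') as f:  # hypothetical path; pass the same file given as dict_output
    dictalign = pickle.load(f)
for src_ind, trg_ind in dictalign[:10]:
    print(src_words[int(src_ind)], trg_words[int(trg_ind)])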
Example #4
	batch_num = 0
	with open(sentence_file, 'r', encoding='utf-8') as fr:
		print('Processing file', sentence_file, '...')
		p = hnswlib.Index(space='cosine', dim=dimension)
		p.init_index(max_elements = num_elements, ef_construction = 2000, M = 80)
		p.set_ef(1000)
		# Set number of threads used during batch search/construction
		# By default using all available cores
		p.set_num_threads(30)
		for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
			sents = list(map(str.strip, n_lines))
			sent_id = list(map(lambda x: int(x.split('\t')[0]), sents))
			sentences = list(map(lambda x: x.split('\t')[-1], sents))
			x, m = data_io.sentences2idx(sentences, words)
			w = data_io.seq2weight(x, m, weight4ind)

			# get SIF embedding
			embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
			embeddings.normalize(embedding, ["unit", "center"])

			p.add_items(embedding, sent_id)
			print('Finished batch', batch_num, '.', end='\r')
			batch_num += 1
	print('\nFinished loading', sentence_file, '.')
	out_file = sentence_file+'.ann'
	p.save_index(out_file)
	print('Finished saving', out_file, '.')
	del p
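
A minimal usage sketch (query_embedding is hypothetical; out_file, dimension, and num_elements as above) for loading the saved index back and retrieving the nearest sentences:

p = hnswlib.Index(space='cosine', dim=dimension)
p.load_index(out_file, max_elements=num_elements)
p.set_ef(1000)  # ef must be at least k for reliable recall
labels, distances = p.knn_query(query_embedding, k=10)  # labels are the original sent_id values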

Example #5
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    np.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = np.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = np.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
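            # CSLS: penalize hub words by subtracting each word's mean
            # similarity to its k nearest neighbors in the other language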
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, np.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = np.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = np.arange(sim_size)
        elif args.direction == 'union':
            src_indices = np.concatenate((np.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = np.concatenate((sim.argmax(axis=1), np.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = np.empty_like(x)
    zw = np.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = np.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = np.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = np.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = np.full(src_size, -100, dtype=dtype)
    src_indices_forward = np.arange(src_size)
    trg_indices_forward = np.zeros(src_size, dtype=int)
    best_sim_backward = np.full(trg_size, -100, dtype=dtype)
    src_indices_backward = np.zeros(trg_size, dtype=int)
    trg_indices_backward = np.arange(trg_size)
    knn_sim_fwd = np.zeros(src_size, dtype=dtype)
    knn_sim_bwd = np.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it
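            # (with the defaults, keep_prob doubles every 50 stagnant
            # iterations: 0.1 -> 0.2 -> 0.4 -> 0.8 -> 1.0; once it reaches 1.0
            # and stagnates again, the final mapping is computed and the loop ends)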

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = np.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = np.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
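            # whitening gives the dictionary rows identity covariance: with the
            # thin SVD m = U S Vt, W = Vt.T diag(1/s) Vt satisfies
            # (m W).T (m W) = I (assuming full rank)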
            def whitening_transformation(m):
                u, s, vt = np.linalg.svd(m, full_matrices=False)
                return vt.T.dot(np.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = np.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
Example #6
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Supervised approach parameters (mirroring the --supervised defaults above)
    init_dictionary = args.supervised
    normalize = ['unit', 'center', 'unit']
    whiten = True
    src_reweight = 0.5
    trg_reweight = 0.5
    src_dewhiten = 'src'
    trg_dewhiten = 'trg'
    batch_size = 1000

    # STEP 0: Normalization
    embeddings.normalize(x, normalize)
    embeddings.normalize(z, normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    # src_word2ind / trg_word2ind are assumed to be built as in Example #5
    f = open(init_dictionary, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
Example #7
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('sense_input', help='the input sense mapping matrix')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('tsns_output',
                        default='tsns.pkl',
                        help='the output target senses pickle file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--future',
                                  action='store_true',
                                  help='experiment with stuff')
    recommended_type.add_argument('--toy',
                                  action='store_true',
                                  help='experiment with stuff on toy dataset')
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log',
        default='map.log',
        help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')

    future_group = parser.add_argument_group('experimental arguments',
                                             'Experimental arguments')
    future_group.add_argument('--skip_top',
                              type=int,
                              default=0,
                              help='Top k words to skip, presumably function')
    future_group.add_argument(
        '--start_src',
        action='store_true',
        help='Algorithm starts by tuning sense embeddings based on source')
    future_group.add_argument('--trim_senses',
                              action='store_true',
                              help='Trim sense table to working vocab')
    future_group.add_argument(
        '--lamb',
        type=float,
        default=0.5,
        help='Weight hyperparameter for sense alignment objectives')
    future_group.add_argument('--reglamb',
                              type=float,
                              default=1.,
                              help='Lasso regularization hyperparameter')
    future_group.add_argument(
        '--ccreglamb',
        type=float,
        default=0.1,
        help='Sense embedding regularization hyperparameter')
    future_group.add_argument('--inv_delta',
                              type=float,
                              default=0.0001,
                              help='Delta_I added for inverting sense matrix')
    future_group.add_argument('--lasso_iters',
                              type=int,
                              default=10,
                              help='Number of iterations for LASSO/NMF')
    future_group.add_argument('--iterations',
                              type=int,
                              default=-1,
                              help='Number of overall model iterations')
    future_group.add_argument('--trg_batch',
                              type=int,
                              default=5000,
                              help='Batch size for target steps')
    future_group.add_argument(
        '--trg_knn',
        action='store_true',
        help='Perform target sense mapping by k-nearest neighbors')
    future_group.add_argument(
        '--trg_sns_csls',
        type=int,
        default=10,
        help='K-nearest neighbors for CSLS target sense search')
    future_group.add_argument(
        '--senses_per_trg',
        type=int,
        default=1,
        help='K-max target sense mapping (default = 1 = off)')
    future_group.add_argument(
        '--gd',
        action='store_true',
        help='Apply gradient descent for assignment and synset embeddings')
    future_group.add_argument('--gd_lr',
                              type=float,
                              default=1e-2,
                              help='Learning rate for SGD (default=0.01)')
    future_group.add_argument('--gd_wd',
                              action='store_true',
                              help='Weight decay in SGD')
    future_group.add_argument(
        '--gd_wd_hl',
        type=int,
        default=100,
        help='Weight decay half-life in SGD, default=100')
    future_group.add_argument(
        '--gd_clip',
        type=float,
        default=5.,
        help='Per-coordinate gradient clipping (default=5)')
    future_group.add_argument(
        '--gd_map_steps',
        type=int,
        default=1,
        help='Consecutive steps for each target-sense mapping update phase')
    future_group.add_argument(
        '--gd_emb_steps',
        type=int,
        default=1,
        help='Consecutive steps for each sense embedding update phase')
    future_group.add_argument(
        '--base_prox_lambda',
        type=float,
        default=0.99,
        help='Lambda for proximal gradient in lasso step')
    future_group.add_argument(
        '--prox_decay',
        action='store_true',
        help='Multiply proximal lambda by itself each iteration')
    future_group.add_argument(
        '--sense_limit',
        type=float,
        default=1.1,
        help=
        'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)'
    )
    future_group.add_argument(
        '--gold_pairs',
        help='Gold data for evaluation, if exists (not for tuning)')
    future_group.add_argument(
        '--gold_threshold',
        type=float,
        default=0.0,
        help='Threshold for gold mapping (0 is fine if sparse)')

    future_group.add_argument('--debug', action='store_true')

    args = parser.parse_args()

    # pre-setting groups
    if args.toy:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=50,
                            trim_senses=True,
                            inv_delta=1.,
                            reglamb=0.2,
                            lasso_iters=100,
                            gd_wd=True,
                            log='map-toy.log')
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=2000,
                            trim_senses=True,
                            gd_wd=True)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=20000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'  # many operations not supported by cupy
    elif args.precision == 'fp32':  # default
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # Read input source sense mapping
    print('reading sense mapping')
    src_senses = pickle.load(open(args.sense_input, 'rb'))
    if src_senses.shape[0] != x.shape[0]:
        src_senses = csr_matrix(src_senses.transpose()
                                )  # using non-cuda scipy because of 'inv' impl
    #src_senses = get_sparse_module(src_senses)
    print(
        f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros'
    )

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # removed word to index map (only relevant in supervised learning or with validation)

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # removed building the seed dictionary

    # removed validation step

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory

    # Initialize the projection matrices W(s) = W(t) = I.
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    xw[:] = x
    zw[:] = z

    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0] - args.skip_top, args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0] - args.skip_top, args.vocabulary_cutoff)
    emb_dim = x.shape[1]

    cutoff_end = min(src_size + args.skip_top, x.shape[0])

    if args.trim_senses:
        # reshape sense assignment
        src_senses = src_senses[args.skip_top:cutoff_end]

        # new columns for words with no senses in original input
        ### TODO might also need this if not trimming (probably kinda far away)
        newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\
                   if src_senses.getrow(i).getnnz() == 0]
        #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file:
        #    dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0]
        #    pickle.dump(np.array(dummy_col_idcs), dummy_cols_file)

        # trim senses no longer used, add new ones
        colsums = src_senses.sum(axis=0).tolist()[0]
        kept_senses = [i for i, j in enumerate(colsums) if j > 0]
        #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file:
        #    pickle.dump(np.array(kept_senses), kept_save_file)
        src_senses = hstack([src_senses[:, kept_senses]] + newcols)
        print(
            f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros'
        )
    sense_size = src_senses.shape[1]

    if args.gold_pairs is not None:
        with open(args.gold_pairs, 'rb') as gold_pairs_f:
            gold_pairs = pickle.load(gold_pairs_f)
            gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \
                          if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]]
        gold_trgs = sorted(set([x[0] for x in gold_pairs]))
        gold_senses = sorted(set([x[1] for x in gold_pairs]))
        gold_domain_size = len(gold_trgs) * len(gold_senses)
        print(
            f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses'
        )

    # Initialize the concept embeddings from the source embeddings
    ### TODO maybe try gradient descent instead?
    ### TODO (pre-)create non-singular alignment matrix
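    # Least-squares initialization: cc = (S^T S + delta*I)^{-1} (S^T x), where S
    # is the sparse word-to-sense assignment matrix and psinv is assumed to
    # compute the regularized inverse term (S^T S + delta*I)^{-1}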
    cc = xp.empty((sense_size, emb_dim), dtype=dtype)  # \tilde{E}
    t01 = time.time()
    print('starting psinv calc')
    src_sns_psinv = psinv(src_senses, dtype, args.inv_delta)
    xecc = x[args.skip_top:cutoff_end].T.dot(
        get_sparse_module(src_senses).toarray()).T  # sense_size * emb_dim
    cc[:] = src_sns_psinv.dot(xecc)
    print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds',
          file=sys.stderr)
    if args.verbose:
        # report precision of the pseudo-inverse operation, checked by re-inverting
        pseudo_id = src_senses.transpose().dot(src_senses).dot(
            src_sns_psinv.get())
        real_id = sparse_id(sense_size)
        rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size)
        print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}')

    ### TODO initialize trg_senses using seed dictionary instead?
    trg_sns_size = trg_size if args.trim_senses else z.shape[0]
    trg_senses = csr_matrix(
        (trg_sns_size,
         sense_size))  # using non-cuda scipy because of 'inv' impl
    zecc = xp.empty_like(xecc)  # sense_size * emb_dim
    #tg_grad = xp.empty((trg_sns_size, sense_size))

    if args.gd:
        # everything can be done on gpu
        src_senses = get_sparse_module(src_senses, dtype=dtype)
        trg_senses = get_sparse_module(trg_senses, dtype=dtype)
        if args.sense_limit > 0.0:
            trg_sense_limit = int(args.sense_limit * src_senses.getnnz())
            if args.verbose:
                print(
                    f'limiting target side to {trg_sense_limit} sense mappings'
                )
        else:
            trg_sense_limit = -1

    ### TODO return memory assignment for similarities?

    # Training loop
    if args.gd:
        prox_lambda = args.base_prox_lambda
    else:
        lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\
                            positive=True, warm_start=True)  # TODO more parametrization
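    # (scikit-learn's Lasso minimizes 0.5/n_samples * ||y - Xw||^2 + alpha*||w||_1
    # per target column; positive=True enforces the nonnegativity constraint and
    # warm_start=True reuses the previous solution across outer iterations)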

    if args.log is not None:
        if args.gd:
            print(f'gradient descent lr: {args.gd_lr}', file=log)
            print(f'base proximal lambda: {args.base_prox_lambda}', file=log)
        else:
            print(f'lasso regularization: {args.reglamb}', file=log)
            print(f'lasso iterations: {args.lasso_iters}', file=log)
            print(f'inversion epsilon: {args.inv_delta}', file=log)
        if args.gold_pairs is not None:
            print(f'gold mappings: {len(gold_pairs)}', file=log)
        print(
            f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings',
            file=log)
        log.flush()

    best_objective = objective = 1000000000.
    correct_mappings = -1
    regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb
    it = 1
    last_improvement = 0
    t = time.time()
    map_gd_lr = args.gd_lr
    emb_gd_lr = args.gd_lr
    end = False
    print('starting training')

    if args.start_src:
        print('starting with converging synset embeddings')
        it_range = range(
            args.iterations
        )  ### TODO possibly add arg, but there's early stopping
        if not args.verbose:
            it_range = tqdm(it_range)
        prev_obj = float('inf')
        for pre_it in it_range:
            if args.gd_wd:
                emb_gd_lr = args.gd_lr * pow(0.5, floor(
                    pre_it / args.gd_wd_hl))

            # Synset embedding
            cc_grad = src_senses.T.dot(
                xw[args.skip_top:cutoff_end] -
                src_senses.dot(cc)) - args.ccreglamb * cc
            cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
            cc += emb_gd_lr * cc_grad

            # Source projection
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)

            pre_objective = ((xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
            pre_objective = float(pre_objective)

            if args.verbose and pre_it > 0 and pre_it % 10 == 0:
                print(
                    f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}'
                )

            if pre_objective > prev_obj:
                print(
                    f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}'
                )
                # revert
                cc -= emb_gd_lr * cc_grad
                break

            prev_obj = pre_objective

    while True:
        if it % 50 == 0:
            print(
                f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}'
            )

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            last_improvement = it

        if args.iterations > 0 and it > args.iterations:
            end = True

        ### update target assignments (6) - lasso-esque regression
        time6 = time.time()
        # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1))

        if args.trg_knn:
            # for csls-based neighborhoods
            knn_sense = xp.full(sense_size, -100, dtype=dtype)
            for i in range(0, sense_size, args.trg_batch):
                batch_end = min(i + args.trg_batch, sense_size)
                sim_sense_trg = cc[i:batch_end].dot(
                    zw[args.skip_top:cutoff_end].T)
                knn_sense[i:batch_end] = topk_mean(sim_sense_trg,
                                                   k=args.trg_sns_csls,
                                                   inplace=True)

            # calculate new target mappings
            trg_senses = lil_matrix(trg_senses.shape)
            for i in range(0, trg_size, args.trg_batch):
                sns_batch_end = min(i + args.trg_batch, trg_size)
                z_i = i + args.skip_top
                z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0])

                sims = zw[z_i:z_batch_end].dot(cc.T)
                sims -= knn_sense / 2  # equivalent to the real CSLS scores for NN
                best_idcs = sims.argmax(1).tolist()
                trg_senses[(list(range(i, sns_batch_end)),
                            best_idcs)] = sims.max(1).tolist()

                # second-to-lth-best
                for l in range(args.senses_per_trg - 1):
                    sims[(list(range(sims.shape[0])), best_idcs)] = 0.
                    best_idcs = sims.argmax(1).tolist()
                    trg_senses[(list(range(i, sns_batch_end)),
                                best_idcs)] = sims.max(1).tolist()

            trg_senses = get_sparse_module(trg_senses.tocsr())

        elif args.gd:
            ### TODO add args.skip_top calculations
            if args.gd_wd:
                true_it = (it - 1) * args.gd_map_steps
                map_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'mapping learning rate: {map_gd_lr}')

            for k in range(args.gd_map_steps):
                # st <- st + eta * (ew - st.dot(es)).dot(es.T)
                # allow up to sense_limit updates, clip gradient

                batch_grads = []
                for i in range(0, trg_size, args.trg_batch):
                    batch_end = min(i + args.trg_batch, trg_size)
                    tg_grad_b = (zw[i:batch_end] -
                                 trg_senses[i:batch_end].dot(cc)).dot(cc.T)

                    # proximal gradient
                    tg_grad_b += prox_lambda
                    tg_grad_b.clip(None, 0.0, out=tg_grad_b)
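                    # (shifting by prox_lambda and keeping only non-positive
                    # values thresholds the update: entries above -prox_lambda
                    # are zeroed, keeping the gradient sparse)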
                    batch_grads.append(batch_sparse(tg_grad_b))

                tg_grad = get_sparse_module(vstack(batch_grads))
                del tg_grad_b

                if args.prox_decay:
                    prox_lambda *= args.base_prox_lambda

                ### TODO consider weight decay here as well (args.gd_wd)
                trg_senses -= map_gd_lr * tg_grad

                # allow up to sense_limit nonzeros
                if trg_sense_limit > 0:
                    trg_senses = trim_sparse(trg_senses,
                                             trg_sense_limit,
                                             clip=None)

            ### TODO consider finishing up with lasso (maybe only in final iteration)

        else:
            ### TODO add args.skip_top calculations
            # parallel LASSO (no cuda impl)
            cccpu = cc.get().T  # emb_dim * sense_size
            lasso_model.fit(cccpu, zw[:trg_size].get().T)
            ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it)
            trg_senses = lasso_model.sparse_coef_

        if args.verbose:
            print(
                f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros',
                file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        # Write target sense mapping
        with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl',
                  mode='wb') as tsnsfile:
            pickle.dump(trg_senses.get(), tsnsfile)

        ### update synset embeddings (10)
        time10 = time.time()
        if args.gd and args.gd_emb_steps > 0:
            ### TODO probably handle sizes and/or threshold sparse matrix
            if args.gd_wd:
                true_it = (it - 1) * args.gd_emb_steps
                emb_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'embedding learning rate: {emb_gd_lr}')

            ### replace block for no-source-tuning mode
            all_senses = trg_senses if args.start_src else get_sparse_module(
                vstack((src_senses.get(), trg_senses.get()), format='csr'),
                dtype=dtype)
            aw = zw[args.skip_top:cutoff_end] if args.start_src else xp.concatenate(
                (xw[args.skip_top:cutoff_end], zw[args.skip_top:cutoff_end]))

            for i in range(args.gd_emb_steps):
                cc_grad = all_senses.T.dot(
                    aw - all_senses.dot(cc)) - args.ccreglamb * cc
                cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
                cc += emb_gd_lr * cc_grad

        else:
            ### TODO add args.skip_top calculations
            all_senses = get_sparse_module(
                vstack((src_senses, trg_senses), format='csr'))
            xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\
                        .dot(all_senses.toarray()).T  # sense_size * emb_dim
            all_sns_psinv = psinv(
                all_senses.get(), dtype, args.inv_delta
            )  ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same]
            cc[:] = all_sns_psinv.dot(xzecc)

        if args.verbose:
            print(f'synset embedding update: {time.time()-time10:.2f}',
                  file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        ### update projections (3,5)
        # write to zw and xw
        if args.orthogonal or not end:

            ### remove block for no-source-tuning mode
            # source side - mappings don't change so xecc is constant
            #if not args.start_src:  # need to do this anyway whenever cc updates
            time3 = time.time()
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)
            if args.verbose:
                print(f'source projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

            # target side - compute sense mapping first
            time3 = time.time()
            zecc.fill(0.)
            for i in range(0, trg_size, args.trg_batch):
                end_idx = min(i + args.trg_batch, trg_size)
                zecc += z[i:end_idx].T.dot(
                    get_sparse_module(trg_senses[i:end_idx]).toarray()).T
            u, s, vt = xp.linalg.svd(cc.T.dot(zecc))
            wz = vt.T.dot(u.T).astype(dtype)
            z.dot(wz, out=zw)
            if args.verbose:
                print(f'target projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

        ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc.

        # Objective function evaluation
        time_obj = time.time()
        trg_senses_l1 = float(trg_senses.sum())
        src_obj = (float(
            xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
        trg_obj = (float(
            xp.linalg.norm(
                zw[args.skip_top:cutoff_end] -
                get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2
        objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1  # TODO consider thresholding reg part
        if args.verbose:
            print(f'objective calculation: {time.time()-time_obj:.2f}',
                  file=sys.stderr)

        if objective - best_objective <= -args.threshold:
            last_improvement = it
            best_objective = objective

        # WordNet transduction evaluation (can't tune on this)
        if args.gold_pairs is not None:
            np_trg_senses = trg_senses.get()
            trg_corr = [
                p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold
            ]
            correct_mappings = len(trg_corr)
            domain_trgs = np_trg_senses[gold_trgs][:, gold_senses]
        else:
            correct_mappings = -1

        # Logging
        duration = time.time() - t
        if args.verbose:
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('objective: {0:.3f}'.format(objective), file=sys.stderr)
            print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1),
                  file=sys.stderr)
            if args.gold_pairs is not None and len(gold_pairs) > 0 and domain_trgs.getnnz() > 0:
                print(
                    f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision',
                    file=sys.stderr)
            print(file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            print(
                f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}',
                file=log)
            log.flush()

        if end:
            break

        t = time.time()
        it += 1

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        embeddings.write(src_words, xw, srcfile)
    with open(args.trg_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        embeddings.write(trg_words, zw, trgfile)

    # Write target sense mapping
    with open(args.tsns_output, mode='wb') as tsnsfile:
        pickle.dump(trg_senses.get(), tsnsfile)
Example #8
def main():
    # Parse command line arguments
    # https://docs.python.org/3/library/argparse.html
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    # description - This argument gives a brief description of what the program does and how it works.
    parser.add_argument('src_input', help='the input source embeddings')
    # help - A brief description of what the argument does.
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    # arguments whose names begin with -- are optional
    # default - The value produced if the argument is absent from the command line.
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    # choices - A container of the allowable values for the argument.
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    # action - The basic type of action to be taken when this argument is encountered at the command line.
    # store_true - stores True when the flag is present (the default is False)
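    # e.g. parser.parse_args(['--cuda']).cuda is True; parser.parse_args([]).cuda is False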
    parser.add_argument(
        '--batch_size',
        default=10000,
        type=int,
        help=
        'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory'
    )
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')
    parser.add_argument('--draw',
                        action='store_true',
                        help='use seaborn to draw')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    # add_argument_group() - returns an argument group object which has an add_argument() method just like a regular ArgumentParser.
    # grouping related arguments gives them their own titled section in the help output
    recommended_type = recommended_group.add_mutually_exclusive_group()
    # argparse ensures that at most one argument from a mutually exclusive group is present on the command line
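    # A minimal sketch of the behaviour (hypothetical flags, separate parser):
    #   p = argparse.ArgumentParser()
    #   g = p.add_mutually_exclusive_group()
    #   g.add_argument('--foo', action='store_true')
    #   g.add_argument('--bar', action='store_true')
    #   p.parse_args(['--foo', '--bar'])  # error: argument --bar: not allowed with argument --foo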
    recommended_type.add_argument(
        '--supervised',
        metavar='DICTIONARY',
        help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--semi_supervised',
        metavar='DICTIONARY',
        help='recommended if you have a small seed dictionary')
    recommended_type.add_argument(
        '--identical',
        action='store_true',
        help=
        'recommended if you have no seed dictionary but can rely on identical words'
    )
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018',
                                  metavar='DICTIONARY',
                                  help='reproduce our AAAI 2018 system')
    # metavar - A name for the argument in usage messages
    recommended_type.add_argument(
        '--acl2017',
        action='store_true',
        help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument(
        '--acl2017_seed',
        metavar='DICTIONARY',
        help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016',
                                  metavar='DICTIONARY',
                                  help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument(
        '-d',
        '--init_dictionary',
        default=sys.stdin.fileno(),
        metavar='DICTIONARY',
        help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical',
                           action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    # no normalization is performed by default (empty list)
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='union',
        help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls',
                                     type=int,
                                     nargs='?',
                                     default=0,
                                     const=10,
                                     metavar='NEIGHBORHOOD_SIZE',
                                     dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        metavar='DICTIONARY',
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability for stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018,
                            normalize=['unit', 'center'],
                            whiten=True,
                            trg_reweight=1,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            self_learning=True,
                            direction='forward',
                            stochastic_initial=1.0,
                            stochastic_interval=1,
                            batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            self_learning=True,
                            direction='forward',
                            stochastic_initial=1.0,
                            stochastic_interval=1,
                            batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    # fix random seed
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    dict_size = 5000
    if args.init_unsupervised:
        sim_size = min(x.shape[0],
                       z.shape[0]) if args.unsupervised_vocab <= 0 else min(
                           x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u * s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u * s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
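        # With X = U S Vt, (u * s).dot(u.T) is U S U.T = sqrt(X X.T), a monolingual
        # similarity matrix; sorting each row discards the vocabulary order and leaves
        # a similarity-distribution signature that is comparable across languages.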
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis] / 2 + knn_sim_bwd / 2
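            # CSLS(x, z) = 2*cos(x, z) - r_fwd(x) - r_bwd(z), where r(.) is the mean
            # similarity to the k nearest neighbors in the other language; the line
            # above computes CSLS/2, which preserves the argmax-based induction below.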
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate(
                (xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate(
                (sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        # ^ anchors the match at the start of the string and $ at the end,
        # so the whole token must consist of the digits 0-9
        # http://www.runoob.com/python/python-reg-expressions.html
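        # e.g. numeral_regex.match('2018') matches, numeral_regex.match('x10') does not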
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
            if len(src_indices) == dict_size:
                break
    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    # apply the vocabulary cutoff, if any
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        # for numeral initialization (stochastic_interval=1): if the objective does not
        # improve for one iteration, training ends directly
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier * keep_prob)
            last_improvement = it
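            # e.g. with the defaults (initial 0.1, multiplier 2.0, interval 50):
            # keep_prob goes 0.1 -> 0.2 -> 0.4 -> 0.8 -> 1.0, and the next stall ends training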

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
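            # Orthogonal Procrustes: with u, s, vt the SVD of Z.T @ X over the
            # dictionary pairs, w = vt.T @ u.T minimizes ||X @ w - Z||_F among
            # orthogonal matrices.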
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
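            # Ordinary least squares via the normal equations:
            # w = inv(X.T @ X) @ X.T @ Z minimizes ||X @ w - Z||_F with no
            # constraint on w.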
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
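                # With m = u s vt, returns vt.T @ diag(1/s) @ vt, so that
                # (m @ W).T @ (m @ W) ~= I, i.e. the whitened embeddings have
                # unit covariance in every direction.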
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(
                zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j - i],
                                                     k=args.csls_neighborhood,
                                                     inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i])
                    simfwd[:j - i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j -
                           i] -= knn_sim_bwd / 2  # Equivalent to the real CSLS scores for NN
                    dropout(simfwd[:j - i],
                            1 - keep_prob).argmax(axis=1,
                                                  out=trg_indices_forward[i:j])
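                    # dropout randomly zeroes similarities with probability
                    # 1 - keep_prob, so the argmax sometimes picks a non-top
                    # candidate; this stochastic induction helps the
                    # self-learning escape poor optima.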
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j - i],
                                                     k=args.csls_neighborhood,
                                                     inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i])
                    simbwd[:j - i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j -
                           i] -= knn_sim_fwd / 2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j - i], 1 - keep_prob).argmax(
                        axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2
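            # With unit-normalized embeddings these dot products are cosine
            # similarities, so the objective is the mean similarity of the induced
            # dictionary (averaged over both directions for 'union').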
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([simval[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                print(
                    '\t- Drop probability: {0:9.4f}%'.format(100 -
                                                             100 * keep_prob),
                    file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # draw distribution of language space
    if args.draw:
        PCA_model = PCA(n_components=2)
        x_PCA = PCA_model.fit_transform(asnumpy(xw))
        x1 = [feature[0] for feature in x_PCA]
        y1 = [feature[1] for feature in x_PCA]
        z_PCA = PCA_model.fit_transform(asnumpy(zw))
        x2 = [feature[0] for feature in z_PCA]
        y2 = [feature[1] for feature in z_PCA]
        '''
        # draw with plt
        plt.scatter(x2, y2, s=10, c='r', alpha=0.4)
        plt.scatter(x1, y1, s=10, c='b', alpha=0.2)
        plt.savefig('./share_space.png')
        '''
        # draw with seaborn
        plt.figure()
        sns.jointplot(x1, y1, kind='hex', color='b')
        plt.savefig('./src_mapped_emb.png')
        plt.figure()
        sns.jointplot(x2, y2, kind='hex', color='g')
        plt.savefig('./trg_mapped_emb.png')

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
Example #9
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument(
        '--batch_size',
        default=10000,
        type=int,
        help=
        'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory'
    )
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')
    parser.add_argument('--maxiter',
                        type=int,
                        default=10,
                        help='max number of iterations')
    parser.add_argument('--corekbest',
                        type=int,
                        default=2,
                        help='nearest-neighbor rank threshold for a pair to count as a match')
    parser.add_argument('--decayrate',
                        type=float,
                        default=1.01,
                        help='decay rate for boosting')
    parser.add_argument('--init_vocab',
                        type=int,
                        default=10000,
                        help='initial vocabulary size for boosting')
    parser.add_argument('--dictname',
                        default='dict.tmp',
                        help='output the dictionary')

    recommended_type = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type.add_argument(
        '--supervised',
        metavar='DICTIONARY',
        help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--identical',
        default=True,
        help=
        'recommended if you have no seed dictionary but can rely on identical words'
    )

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument(
        '-d',
        '--init_dictionary',
        default=sys.stdin.fileno(),
        metavar='DICTIONARY',
        help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical',
                           action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--vocabulary', help='restrict source vocab')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--csls',
                                     type=int,
                                     nargs='?',
                                     default=0,
                                     const=10,
                                     metavar='NEIGHBORHOOD_SIZE',
                                     dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        metavar='DICTIONARY',
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    parser.set_defaults(init_dictionary=args.supervised,
                        normalize=['unit', 'center', 'unit'])
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    os.makedirs(OUTPUTDIR, exist_ok=True)

    # Read input embeddings
    vocabulary = None
    if args.vocabulary is not None:
        vocabulary = set()
        with open(args.vocabulary,
                  encoding=args.encoding,
                  errors='surrogateescape') as file:
            for l in file:
                vocabulary.add(l.split()[0])
        print(f'vocab size:\t{len(vocabulary)}')

    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff,
                                       vocabulary=vocabulary)
        trg_words, z = embeddings.read(trgfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff)
        embeddings.normalize(x, args.normalize)
        embeddings.normalize(z, args.normalize)
    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build the seed dictionary
    src_indices = []
    trg_indices = []

    if args.supervised:
        f = open(args.init_dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            try:
                src, trg = line.split()[:2]
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            try:
                src, trg = line.split()
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)

    matches = collections.Counter()
    decided = collections.Counter()
    cum_weights = collections.Counter(matches)
    score = collections.Counter()
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    identical = set(src_words).intersection(set(trg_words))
    for word in list(identical):
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1

    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple(
        'MatchStats',
        ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    stats = None
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        # x, z = np.array(x0), np.array(z0)

        embeddings.noise(x)
        embeddings.noise(z)

        if args.unconstrained:
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices],
                                np.sqrt(weights) * z[trg_indices],
                                rcond=None)[0]
            # w = np.linalg.lstsq(x[src_indices], z[trg_indices], rcond=None)[0]
            x.dot(w, out=xw)
            zw[:] = z
        else:
            u, s, vt = xp.linalg.svd(
                (weights * z[trg_indices]).T.dot(x[src_indices]))
            # u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
            w_dot = np.sum(
                weights * z[trg_indices] * xw[src_indices]) / weights.sum()
            mean_dot = np.sum(
                z[trg_indices] * xw[src_indices]) / len(src_indices)
            delta_w = np.linalg.norm(w - wprev)
            stats = Stats(w_dot=w_dot,
                          mean_dot=mean_dot,
                          delta_w=delta_w,
                          current_vocab=current_vocab,
                          len_match=len(src_indices))

        if it > 1 and stats.w_dot < pstats.w_dot:
            current_vocab = min(int(current_vocab * 1.1),
                                args.vocabulary_cutoff)

        T = 1 * np.exp((it - 1) * np.log(1e-2) / (args.maxiter))
        # T = 1
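        # geometric annealing: T = exp((it - 1) * ln(1e-2) / maxiter) decays from 1
        # at the first iteration towards 1e-2 as it approaches args.maxiter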
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw,
                                          zw,
                                          cum_weights,
                                          score,
                                          ul=current_vocab,
                                          T=T,
                                          kbest=args.corekbest,
                                          csls=args.csls_neighborhood,
                                          decay=args.decayrate)

        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)

        for m in score:
            if m in decided:
                eta = 1 / it
            else:
                eta = max(0.5, 1 / it)
            decided[m] = decided[m] * (1 - eta) + score[m] * eta
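        # exponential moving average: decided <- (1 - eta)*decided + eta*score, with
        # eta shrinking as 1/it for previously decided pairs and staying at least 0.5
        # for newly scored ones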

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([
                1 if nn[i] in validation[src[i]] else 0
                for i in range(len(src))
            ])
            similarity = np.mean([
                np.max([simval[i, j].tolist() for j in validation[src[i]]])
                for i in range(len(src))
            ])

        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t

        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('\t- Objective:        {0:9.4f}%'.format(100 * objective),
                  file=sys.stderr)
            print(
                f'\t- #match/#decided:             {len(src_indices)}/{len(decided)}',
                file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                               similarity),
                      file=sys.stderr)
                print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy),
                      file=sys.stderr)
                print('\t- Val. coverage:    {0:9.4f}%'.format(
                    100 * validation_coverage),
                      file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                100 * similarity, 100 * accuracy, 100 *
                validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val,
                                                      duration),
                  file=log)
            log.flush()

        if it >= args.maxiter:
            break
        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    # write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)

    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, x.dot(w), srcfile)
        embeddings.write(trg_words, z, trgfile)
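
The mapping step shared by all of the examples above is the orthogonal Procrustes problem. Below is a minimal self-contained sketch (NumPy only, toy random data; the variable names are illustrative and not part of the scripts above) showing that the SVD solution recovers a rotation between two embedding spaces:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((1000, 50))                      # toy "source" embeddings
r, _ = np.linalg.qr(rng.standard_normal((50, 50)))       # a random rotation
z = x.dot(r) + 0.01 * rng.standard_normal((1000, 50))    # rotated, slightly noisy "target"

# Orthogonal Procrustes: with x.T @ z = u s vt, the orthogonal w = u @ vt
# minimizes ||x @ w - z||_F.
u, s, vt = np.linalg.svd(x.T.dot(z))
w = u.dot(vt)

print('error before mapping:', np.linalg.norm(x - z))
print('error after mapping: ', np.linalg.norm(x.dot(w) - z))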