Example #1
    def make_dataset(self, root):
        root = root.split(' ')
        view1 = open(root[0], encoding='utf-8', errors='surrogateescape')
        view2 = open(root[1], encoding='utf-8', errors='surrogateescape')
        view1_words, view1_vec = embeddings.read(view1)
        view2_words, view2_vec = embeddings.read(view2)

        view1_vec = embeddings.length_normalize(view1_vec)
        view2_vec = embeddings.length_normalize(view2_vec)

        view1.close()
        view2.close()

        return torch.from_numpy(np.column_stack((view1_vec, view2_vec)))
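The method above appears to assume that the two embedding files list the same words in the same order, so that column-stacking aligns each word's two views row by row. A toy illustration of that stacking step (hypothetical 2-word, 2-dimensional views):

import numpy as np
import torch

view1_vec = np.array([[1.0, 2.0], [3.0, 4.0]])   # 2 words, 2-dim view 1
view2_vec = np.array([[5.0, 6.0], [7.0, 8.0]])   # same 2 words, 2-dim view 2
stacked = torch.from_numpy(np.column_stack((view1_vec, view2_vec)))
print(stacked.shape)   # torch.Size([2, 4]): each row is [view1 | view2]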
Example #2
def normalize_emb(emb, method):
    """
    Normalize input embedding based on the choice of method
    """
    print(f"Normalizing using {method}")
    if method == 'unit':
        emb = embeddings.length_normalize(emb)
    elif method == 'center':
        emb = embeddings.mean_center(emb)
    elif method == 'unitdim':
        emb = embeddings.length_normalize_dimensionwise(emb)
    elif method == 'centeremb':
        emb = embeddings.mean_center_embeddingwise(emb)

    return emb
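The embeddings.* helpers called throughout these examples come from the surrounding repository and are not shown here. As a minimal NumPy sketch of what the four normalization helpers presumably compute (an assumption based on their names and usage, not the repository's actual implementation):

import numpy as np

def length_normalize(m):
    # Scale each row (word vector) to unit L2 norm, guarding against zero rows.
    norms = np.sqrt(np.sum(m ** 2, axis=1))
    norms[norms == 0] = 1
    return m / norms[:, np.newaxis]

def mean_center(m):
    # Subtract the per-dimension mean so every column is zero-centred.
    return m - m.mean(axis=0)

def length_normalize_dimensionwise(m):
    # Scale each dimension (column) to unit L2 norm.
    norms = np.sqrt(np.sum(m ** 2, axis=0))
    norms[norms == 0] = 1
    return m / norms

def mean_center_embeddingwise(m):
    # Subtract each word vector's own mean (row-wise centring).
    return m - m.mean(axis=1)[:, np.newaxis]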
Example #3
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument(
        'actions',
        choices=['none', 'unit', 'center', 'unitdim', 'centeremb'],
        nargs='+',
        help='the actions to perform in order')
    parser.add_argument(
        '-i',
        '--input',
        default=sys.stdin.fileno(),
        help='the input word embedding file (defaults to stdin)')
    parser.add_argument(
        '-o',
        '--output',
        default=sys.stdout.fileno(),
        help='the output word embedding file (defaults to stdout)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions
    for action in args.actions:
        if action == 'unit':
            matrix = embeddings.length_normalize(matrix)
        elif action == 'center':
            matrix = embeddings.mean_center(matrix)
        elif action == 'unitdim':
            matrix = embeddings.length_normalize_dimensionwise(matrix)
        elif action == 'centeremb':
            matrix = embeddings.mean_center_embeddingwise(matrix)

    # Write normalized embeddings
    f = open(args.output,
             mode='w',
             encoding=args.encoding,
             errors='surrogateescape')
    embeddings.write(words, matrix, f)
Example #4
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Generate latent space embeddings')
    parser.add_argument('emb1', help='path to embedding 1')
    parser.add_argument('emb2', help='path to embedding 2')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('--dictionary',
                               default=sys.stdin.fileno(),
                               help='the dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'no'],
        nargs=2,
        default=[],
        help=
        'the normalization actions performed in sequence for embeddings 1 and 2'
    )

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    args = parser.parse_args()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading embeddings data...')

    # Read input embeddings
    emb1file = open(args.emb1,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb2file = open(args.emb2,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype)
    emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype)

    # Build word to index map
    emb1_word2ind = {word: i for i, word in enumerate(emb1_words)}
    emb2_word2ind = {word: i for i, word in enumerate(emb2_words)}

    noov = 0
    emb1_indices = []
    emb2_indices = []
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        emb1, emb2 = line.split()
        try:
            emb1_ind = emb1_word2ind[emb1]
            emb2_ind = emb2_word2ind[emb2]
            emb1_indices.append(emb1_ind)
            emb2_indices.append(emb2_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    emb1, emb2),
                      file=sys.stderr)
    f.close()
    if args.verbose:
        print('Number of embedding pairs having at least one OOV: {}'.format(
            noov))
    if args.verbose:
        print('Normalizing embeddings...')

    # STEP 0: Normalization
    if len(args.normalize) > 0:
        x = normalize_emb(x, args.normalize[0])
        z = normalize_emb(z, args.normalize[1])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(emb1_indices))
    z_count = len(set(emb2_indices))

    # Deduplicate dictionary indices and map them to compact positions
    map_dict_emb1 = {}
    map_dict_emb2 = {}
    I = 0
    uniq_emb1 = []
    uniq_emb2 = []
    for i in range(len(emb1_indices)):
        if emb1_indices[i] not in map_dict_emb1.keys():
            map_dict_emb1[emb1_indices[i]] = I
            I += 1
            uniq_emb1.append(emb1_indices[i])
    J = 0
    for j in range(len(emb2_indices)):
        if emb2_indices[j] not in map_dict_emb2.keys():
            map_dict_emb2[emb2_indices[j]] = J
            J += 1
            uniq_emb2.append(emb2_indices[j])

    # Create the sparse dictionary alignment matrix
    row = list(range(0, x_count))
    col = list(range(0, x_count))
    data = [1 for i in range(0, x_count)]
    print(f"Counts: {x_count}, {z_count}")
    A = coo_matrix((data, (row, col)), shape=(x_count, z_count))

    np.random.seed(0)
    Lambda = args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()

    Xemb1 = x[uniq_emb1]
    Zemb2 = z[uniq_emb2]
    del x, z
    gc.collect()

    Kx, Kz = Xemb1, Zemb2
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    AA = np.sum(A * A)

    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5 * Lambda * (TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    cost += shared(AA)

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(Kx.shape[1], Kx.shape[1]),
        Stiefel(Kz.shape[1], Kz.shape[1]),
        PositiveDefinite(Kx.shape[1])
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)
    print(f"Problem solved ...")

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    print(f"Model copied ...")

    gc.collect()

    # Step 2: Transformation
    xw = Kx.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = Kz.dot(U2).dot(scipy.linalg.sqrtm(B))
    print(f"Transformation done ...")

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))

    del Kx, Kz, B, U1, U2
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)

    del xw, zw
    gc.collect()

    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb1.vec')
        new_emb1_words = []
        for id in uniq_emb1:
            new_emb1_words.append(emb1_words[id])
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb1_words, xw_n, outfile)

        new_emb2_words = []
        for id in uniq_emb2:
            new_emb2_words.append(emb2_words[id])
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb2.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb2_words, zw_n, outfile)

    sys.exit(0)
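The transformation step above places both embedding sets in a shared latent space: with xw = X·U1·B^(1/2) and zw = Z·U2·B^(1/2), the latent-space dot products xw·zw^T equal X·W·Z^T for the bilinear mapping W = U1·B·U2^T used in the cost above. A small self-contained NumPy check of that identity on toy matrices (random stand-ins, not the optimizer's output):

import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
d = 4
x = rng.normal(size=(5, d))                    # toy "source" embeddings
z = rng.normal(size=(7, d))                    # toy "target" embeddings
U1, _ = np.linalg.qr(rng.normal(size=(d, d)))  # orthogonal factors
U2, _ = np.linalg.qr(rng.normal(size=(d, d)))
M = rng.normal(size=(d, d))
B = M.dot(M.T) + d * np.eye(d)                 # symmetric positive definite

sqrtB = scipy.linalg.sqrtm(B).real
xw = x.dot(U1).dot(sqrtB)
zw = z.dot(U2).dot(sqrtB)
W = U1.dot(B).dot(U2.T)                        # bilinear mapping from the cost
assert np.allclose(xw.dot(zw.T), x.dot(W).dot(z.T))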
Example #5
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile)
    trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in (
                            'backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) +
                             np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
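The orthogonal update inside the training loop is the closed-form solution of the orthogonal Procrustes problem: with Z[trg]^T·X[src] = U·S·V^T, the mapping W = V·U^T maximizes trace(W^T·X[src]^T·Z[trg]) over orthogonal matrices. A quick sanity check against SciPy's reference implementation on toy data (assuming SciPy is available; the random matrices are stand-ins for x[src_indices] and z[trg_indices]):

import numpy as np
from scipy.linalg import orthogonal_procrustes

rng = np.random.default_rng(0)
xs = rng.normal(size=(50, 8))   # stand-in for x[src_indices]
zs = rng.normal(size=(50, 8))   # stand-in for z[trg_indices]

# Update used in the loop above
u, s, vt = np.linalg.svd(np.dot(zs.T, xs))
w = np.dot(vt.T, u.T)

# SciPy's solution of min ||xs @ R - zs||_F over orthogonal R
r, _ = orthogonal_procrustes(xs, zs)
assert np.allclose(w, r)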
Example #6
def translate(words_to_translate,
              src_emb_info,
              tgt_emb_info,
              retrieval_method="csls",
              csls_k=10,
              batch_size=2500):

    print('Starting translation')
    sys.stdout.flush()
    # Read source embeddings
    src_words, x = src_emb_info
    src_word2ind = build_w2i(src_words)

    # Read target embeddings
    tgt_words, z = tgt_emb_info
    tgt_word2ind = build_w2i(tgt_words)

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()

    for w in words_to_translate:
        try:
            all_words.append(w)
            w_ind = src_word2ind[w]
            trans_words.append(w)
            trans_idx.append(w_ind)
        except KeyError:
            oov.add(w)

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))
    src = trans_idx

    print('Number of words to translate: {}'.format(len(src)))

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            #             similarities_idx = similarities.argsort(axis=1)
            #             nn5 = similarities_idx[:,-5:]
            #             nn10 = similarities_idx[:,-10:]

            for k in range(j - i):
                translation[src[i + k]] = nn[k]
#                 translation5[src[i+k]] = nn5[k]
#                 translation10[src[i+k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        sys.stdout.flush()
        # batch_size=1000
        batch_num = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')
        sys.stdout.flush()

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        print('Computing nearest neighbours')
        sys.stdout.flush()
        csls_alpha = 1
        batch_num = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(np.transpose(2*similarities) \
                                        - csls_alpha*nbrhood_x[src[i:j]]) \
                                        - csls_alpha*nbrhood_z

            nn = similarities.argmax(axis=1).tolist()
            #             similarities = np.argsort((similarities),axis=1)
            #             nn5 = (similarities[:,-5:])
            #             nn10 = (similarities[:,-10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
#                 translation5[src[i+k]] = nn5[k]
#                 translation10[src[i+k]] = nn10[k]

            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        print('Completed in {0} seconds'.format(time.time() - t))
        sys.stdout.flush()

    # get translations
    trans_pairs = []
    for w in trans_words:
        trans = ''
        if w in src_word2ind:
            trans = tgt_words[translation[src_word2ind[w]]]
            trans_pairs.append((w, trans))

    return dict(trans_pairs)
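The 'csls' branch above implements cross-domain similarity local scaling: each cosine similarity is penalized by the mean similarity of the source word to its csls_k nearest target neighbours and of the target word to its csls_k nearest source neighbours, which discounts "hub" targets that are close to everything. Ignoring the batching and the CuPy offloading, a compact NumPy-only sketch of the same score (with csls_alpha fixed to 1, as in the code above) is:

import numpy as np

def csls_scores(xw, zw, k=10):
    # xw, zw: length-normalized source / target embedding matrices.
    sims = xw.dot(zw.T)                                                    # cosine similarities
    knn_x = np.mean(-np.partition(-sims, k - 1, axis=1)[:, :k], axis=1)    # source neighbourhood term
    knn_z = np.mean(-np.partition(-sims.T, k - 1, axis=1)[:, :k], axis=1)  # target neighbourhood term
    # CSLS(x, z) = 2*cos(x, z) - knn_x - knn_z
    return 2 * sims - knn_x[:, None] - knn_z[None, :]

# Nearest neighbours under CSLS would then be csls_scores(xw, zw).argmax(axis=1).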
Example #7
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--retrieval', default='nn', choices=['nn', 'invnn', 'invsoftmax', 'csls'], help='the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)')
    parser.add_argument('--inv_temperature', default=1, type=float, help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument('--inv_sample', default=None, type=int, help='use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)')
    parser.add_argument('-k', '--neighborhood', default=10, type=int, help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--dot', action='store_true', help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    print(f'dictionary read')

    # Find translations
    translation = collections.defaultdict(int)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]] = nn[k]
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k-i, l]
                    sim = sims[k-i, l]
                    if rank < best_rank[l] or (rank == best_rank[l] and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        translation[src[l]] = k
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
        sample = xp.arange(x.shape[0]) if args.inv_sample is None else xp.random.randint(0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature*z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature*x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]] = nn[k]
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T), k=args.neighborhood, inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2*x[src[i:j]].dot(z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j-i):
                translation[src[i+k]] = nn[k]

    # Compute accuracy
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy))
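topk_mean and BATCH_SIZE are defined in the surrounding module rather than in this excerpt. A minimal NumPy stand-in for topk_mean (the mean of the k largest values in each row) could look like the sketch below; it ignores the real helper's inplace buffer reuse and is an illustration, not the repository's implementation:

import numpy as np

def topk_mean(m, k, inplace=False):
    # Mean of the k largest values in each row of m. The `inplace` flag is
    # accepted only for signature compatibility; this sketch always copies.
    return np.sort(m, axis=1)[:, -k:].mean(axis=1)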
Example #8
def translate(src_emb_fname,
              tgt_emb_fname,
              trans_tgt_fname,
              trans_src_fname=None,
              retrieval_method="csls",
              csls_k=10,
              batch_size=2500):

    print('Loading train data...')

    srcfile = open(src_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')
    tgtfile = open(tgt_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}

    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=0, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}

    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()
    if trans_src_fname is not None:
        with open(trans_src_fname,
                  'r',
                  encoding='utf-8',
                  errors='surrogateescape') as trans_src_file:
            for line in trans_src_file:
                try:
                    #w=line.strip().lower()
                    w = line.strip()
                    all_words.append(w)
                    w_ind = src_word2ind[w]
                    trans_words.append(w)
                    trans_idx.append(w_ind)
                except KeyError:
                    oov.add(w)
    else:
        all_words = src_words
        trans_words = src_words
        trans_idx = list(range(len(src_words)))
        oov = set()

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))
    src = trans_idx

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]

            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        # batch_size=1000
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))
        print('Computing Z Neighbourhood')

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) -
                csls_alpha * nbrhood_x[src[i:j]]) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    ### write the translations (1 pair per line format)
    with open(trans_tgt_fname, 'w', encoding='utf-8',
              errors='surrogateescape') as trans_tgt_file:
        for w in trans_words:
            trans = ''
            if w in src_word2ind:
                trans = tgt_words[translation[src_word2ind[w]]]
            trans_tgt_file.write('{}\t{}\n'.format(w, trans))
Example #9
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word similarity/relatedness')
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        nargs='?',
                        help='the target language embeddings')
    parser.add_argument('-i',
                        '--input',
                        default=[sys.stdin.fileno()],
                        nargs='+',
                        help='the input datasets (defaults to stdin)')
    parser.add_argument('-l',
                        '--lowercase',
                        action='store_true',
                        help='lowercase the words in the test files')
    parser.add_argument('--backoff',
                        default=None,
                        type=float,
                        help='use a backoff similarity score for OOV entries')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument(
        '--sim',
        nargs='*',
        help='the names of the datasets to include in the similarity results')
    parser.add_argument(
        '--rel',
        nargs='*',
        help='the names of the datasets to include in the relatedness results')
    parser.add_argument(
        '--all',
        nargs='*',
        help='the names of the datasets to include in the total results')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Parse test files
    word_pairs = []
    golds = []
    for filename in args.input:
        f = open(filename, encoding=args.encoding, errors='surrogateescape')
        word_pairs.append([])
        golds.append([])
        for line in f:
            if args.lowercase:
                line = line.lower()
            src, trg, score = line.split('\t')
            word_pairs[-1].append((src, trg))
            golds[-1].append(float(score))

    # Build vocabularies
    src_vocab = {pair[0] for pairs in word_pairs for pair in pairs}
    trg_vocab = {pair[1] for pairs in word_pairs for pair in pairs}

    # Read embeddings
    srcfile = open(args.src_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.src_embeddings
                   if args.trg_embeddings is None else args.trg_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile,
                                            vocabulary=src_vocab,
                                            dtype=dtype)
    trg_words, trg_matrix = embeddings.read(trgfile,
                                            vocabulary=trg_vocab,
                                            dtype=dtype)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Compute system scores and correlations
    results = []
    for i in range(len(golds)):
        system = []
        gold = []
        oov = 0
        for gold_score, (src, trg) in zip(golds[i], word_pairs[i]):
            try:
                cos = np.dot(src_matrix[src_word2ind[src]],
                             trg_matrix[trg_word2ind[trg]])
                system.append(cos)
                gold.append(gold_score)
            except KeyError:
                if args.backoff is None:
                    oov += 1
                else:
                    system.append(args.backoff)
                    gold.append(gold_score)
        name = os.path.splitext(os.path.basename(args.input[i]))[0]
        coverage = len(system) / (len(system) + oov)
        pearson = scipy.stats.pearsonr(gold, system)[0]
        spearman = scipy.stats.spearmanr(gold, system)[0]
        results.append((name, coverage, pearson, spearman))
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | {3}'.
              format(coverage, pearson, spearman, name))

    # Compute and print total (averaged) results
    if len(results) > 1:
        print('-' * 80)
        if args.sim is not None:
            sim = list(zip(*[res for res in results if res[0] in args.sim]))
            print(
                'Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | sim.'
                .format(np.mean(sim[1]), np.mean(sim[2]), np.mean(sim[3])))
        if args.rel is not None:
            rel = list(zip(*[res for res in results if res[0] in args.rel]))
            print(
                'Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | rel.'
                .format(np.mean(rel[1]), np.mean(rel[2]), np.mean(rel[3])))
        if args.all is not None:
            results = [res for res in results if res[0] in args.all]
        results = list(zip(*results))
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | all'.
              format(np.mean(results[1]), np.mean(results[2]),
                     np.mean(results[3])))
Example #10
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--dot', action='store_true', help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--output', type=str, help='file to write record of correct/incorrect translations')
    parser.add_argument('--identity', action='store_true', help='do evaluation as normal, but if identity translation is available, use it instead')
    parser.add_argument('--identity_dict', action='store_true', help='do evaluation as normal, but if identity translation is available within dictionary, use it instead')
    parser.add_argument('--identity_either', action='store_true', help='do evaluation as normal, but if identity translation is available AND correct, use it instead')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        src_matrix = embeddings.length_normalize(src_matrix)
        trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    dict_trgs = set()
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            dict_trgs.add(trg)
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    if args.output:
        outputfile = open(args.output,
                          mode='w',
                          encoding=args.encoding,
                          errors='surrogateescape')

    # Compute accuracy
    correct = 0
    src, trg = zip(*src2trg.items())
    for i in range(0, len(src2trg), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src2trg))
        similarities = src_matrix[list(src[i:j])].dot(trg_matrix.T)
        nn = np.argmax(similarities, axis=1).tolist()
        for k in range(j-i):
            sw = src_words[src[i+k]]
            tws = [trg_words[t] for t in trg[i+k]]
            bCor = False
            guess = trg_words[nn[k]]
            if args.identity and sw in trg_word2ind: #able to use identity as guess
                guess = sw
                if sw in tws: #guessing identity is correct
                    bCor = True
                    correct += 1
                #else, guessing identity is incorrect
            elif args.identity_dict and sw in dict_trgs:
                guess = sw
                if sw in tws:
                    bCor = True
                    correct += 1
            elif nn[k] in trg[i+k]:
                correct += 1
                bCor = True
            elif args.identity_either and sw in tws:
                correct += 1
                bCor = True
                guess = sw

            if args.output:
                if bCor:
                    outputfile.write("Correct:{} {} {}\n".format(sw, guess, tws))
                else:
                    outputfile.write("Incorrect:{} {} {}\n".format(sw, guess, tws))

    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, correct / len(src2trg)))
Example #11
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    advanced_group = parser.add_argument_group(
        'advanced mapping arguments',
        'Advanced embedding mapping arguments (AAAI 2018)')
    advanced_group.add_argument('--whiten',
                                action='store_true',
                                help='whiten the embeddings')
    advanced_group.add_argument(
        '--src_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the source language embeddings')
    advanced_group.add_argument(
        '--trg_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the target language embeddings')
    advanced_group.add_argument(
        '--src_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the source language embeddings')
    advanced_group.add_argument(
        '--trg_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the target language embeddings')
    advanced_group.add_argument('--dim_reduction',
                                type=int,
                                default=0,
                                help='apply dimensionality reduction')
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            xw = x.dot(w)
            zw = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            xw = x.dot(w)
            zw = z
        else:  # advanced mapping
            xw = x
            zw = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(
                zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(zw[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(zw.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
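
For reference, the orthogonal branch of the update above (the args.orthogonal case) is a Procrustes solution: stacking the dictionary pairs into X_d and Z_d, the optimal rotation comes from the SVD of Z_d^T X_d. Below is a minimal NumPy sketch of that single step on toy data; the matrices and names are illustrative and not part of the script above.

import numpy as np

def procrustes_update(x, z, src_idx, trg_idx):
    # One orthogonal-mapping step: rotate x so that x[src_idx] best aligns with z[trg_idx].
    u, s, vt = np.linalg.svd(z[trg_idx].T.dot(x[src_idx]))
    w = vt.T.dot(u.T)        # orthogonal map for the source space
    return x.dot(w), z       # the target side is left unchanged

rng = np.random.RandomState(0)
x = rng.randn(5, 4); x /= np.linalg.norm(x, axis=1, keepdims=True)
z = rng.randn(5, 4); z /= np.linalg.norm(z, axis=1, keepdims=True)
xw, zw = procrustes_update(x, z, [0, 1, 2], [0, 1, 2])
print(np.allclose(xw.dot(xw.T), x.dot(x.T)))   # True: a rotation preserves all dot products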
Example #12
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word analogy')
    parser.add_argument('embeddings', help='the word embeddings')
    parser.add_argument(
        '-t',
        '--threshold',
        type=int,
        default=0,
        help=
        'reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)'
    )
    parser.add_argument('-i',
                        '--input',
                        default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l',
                        '--lowercase',
                        action='store_true',
                        help='lowercase the words in the test file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.embeddings, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f, threshold=args.threshold)

    # Length normalize embeddings
    matrix = embeddings.length_normalize(matrix)

    # Build word to index map
    word2ind = {word: i for i, word in enumerate(words)}

    # Compute accuracy and coverage and print results
    category = category_name = None
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        if line.startswith(': '):
            if args.verbose and category is not None:
                print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                    category['total'] / (category['total'] + category['oov']),
                    category['correct'] / category['total'], category_name))
            category_name = line[2:-1]
            current = syntactic if category_name.startswith(
                'gram') else semantic
            category = {'correct': 0, 'total': 0, 'oov': 0}
        else:
            try:
                src1, trg1, src2, trg2 = [
                    word2ind[word.lower() if args.lowercase else word]
                    for word in line.split()
                ]
                similarities = np.dot(
                    matrix, matrix[src2] - matrix[src1] + matrix[trg1])
                similarities[[src1, trg1, src2]] = -1
                closest = np.argmax(similarities)
                if closest == trg2:
                    category['correct'] += 1
                    current['correct'] += 1
                category['total'] += 1
                current['total'] += 1
            except KeyError:
                category['oov'] += 1
                current['oov'] += 1
    if args.verbose:
        print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
            category['total'] / (category['total'] + category['oov']),
            category['correct'] / category['total'], category_name))
        print('-' * 80)
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.
          format((semantic['total'] + syntactic['total']) /
                 (semantic['total'] + syntactic['total'] + semantic['oov'] +
                  syntactic['oov']),
                 (semantic['correct'] + syntactic['correct']) /
                 (semantic['total'] + syntactic['total']),
                 semantic['correct'] / semantic['total'],
                 syntactic['correct'] / syntactic['total']))
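
The prediction rule in the loop above is the standard 3CosAdd analogy test: for a line "src1 trg1 src2 trg2" every vocabulary word is scored by its dot product with matrix[src2] - matrix[src1] + matrix[trg1] (cosine similarity, since the rows are length-normalized), the three query words are excluded, and the argmax is compared against trg2. A tiny self-contained sketch with made-up vectors:

import numpy as np

words = ['king', 'queen', 'man', 'woman', 'apple']
vecs = np.array([[0.9, 0.1, 0.4],
                 [0.8, 0.9, 0.4],
                 [0.9, 0.1, 0.0],
                 [0.8, 0.9, 0.0],
                 [0.1, 0.1, 0.9]])
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)      # length-normalize rows
w2i = {w: i for i, w in enumerate(words)}

# king : queen :: man : ?   ->   target direction  man - king + queen
src1, trg1, src2 = w2i['king'], w2i['queen'], w2i['man']
scores = vecs.dot(vecs[src2] - vecs[src1] + vecs[trg1])
scores[[src1, trg1, src2]] = -1                          # never predict a query word
print(words[int(np.argmax(scores))])                     # prints 'woman' for this toy data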
Example #13
0
def evaluate(src_emb_fname,
             tgt_emb_fname,
             dict_fname,
             max_voc=0,
             retrieval_method="csls",
             csls_k=10,
             batch_size=2500):

    print('Loading train data...')

    srcfile = open(src_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')
    tgtfile = open(tgt_emb_fname,
                   'r',
                   encoding='utf-8',
                   errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=max_voc, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}

    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=max_voc, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}

    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Loading test dictionary
    f = open(dict_fname, encoding='utf-8', errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if max_voc:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = tgt_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    ### get translations
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]

            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        # batch_size=1000
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))
        print('Computing Z Neighbourhood')

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) -
                csls_alpha * nbrhood_x[src[i:j]]) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    #### write the translations (1 pair per line format)
    #with open(trans_tgt_fname, 'w', encoding='utf-8', errors='surrogateescape') as trans_tgt_file:
    #    for w in trans_words:
    #        trans=''
    #        if w in src_word2ind:
    #            trans=tgt_words[translation[src_word2ind[w]]]
    #        trans_tgt_file.write('{}\t{}\n'.format(w,trans))

    # evaluation metrics
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
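
The 'csls' branch above computes Cross-domain Similarity Local Scaling (CSLS): each cosine similarity is penalised by the mean similarity of the source word to its csls_k nearest target neighbours and of the target word to its csls_k nearest source neighbours, i.e. csls(x, z) = 2*cos(x, z) - r_trg(x) - r_src(z). A small dense NumPy sketch of the same scoring, without the batching and CuPy offloading used above (names are illustrative):

import numpy as np

def csls_scores(xw, zw, k=10):
    # Dense CSLS matrix for length-normalized source (xw) and target (zw) embeddings.
    sim = xw.dot(zw.T)                                       # plain cosine similarities
    r_src = np.mean(np.sort(sim, axis=1)[:, -k:], axis=1)    # each source word's k-NN mean similarity
    r_trg = np.mean(np.sort(sim, axis=0)[-k:, :], axis=0)    # each target word's k-NN mean similarity
    return 2 * sim - r_src[:, None] - r_trg[None, :]

rng = np.random.RandomState(0)
xw = rng.randn(6, 4); xw /= np.linalg.norm(xw, axis=1, keepdims=True)
zw = rng.randn(8, 4); zw /= np.linalg.norm(zw, axis=1, keepdims=True)
print(csls_scores(xw, zw, k=3).argmax(axis=1))               # CSLS nearest neighbour of each source word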
Example #14
0
            if i % 5 == 4:
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 5))
                #print('Cos loss: {}, reconstruction_loss:{}'.format(l1, l3))
                print('Cos loss: {}, regular_loss: {}, reconstruction_loss: {}'.format(l1, l2, l3))
                running_loss = 0.0
                # print(net.view1_fc.weight.grad)

    # print(net.view1_AE.encode_layer_0.weight.data)
    print('Cos loss: {}, regular_loss: {}, reconstruction_loss: {}'.format(l1, l2, l3))

    source_file = open('new_embedding_size640.en', encoding='utf-8', errors='surrogateescape')
    target_file = open('new_embedding_size640.de', encoding='utf-8', errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)

    en_vec = embeddings.length_normalize(en_vec)
    de_vec = embeddings.length_normalize(de_vec)

    input_view1, input_view2 = Variable(torch.from_numpy(en_vec).cuda()), Variable(torch.from_numpy(de_vec).cuda())

    res_envec, x1, res_devec, x2 = net(input_view1.float(), input_view2.float())
    print(x1)

    src_file = open('BiAE.en', mode='w', encoding='utf-8', errors='surrogateescape')
    trg_file = open('BiAE.de', mode='w', encoding='utf-8', errors='surrogateescape')

    # res_envec = embeddings.length_normalize(res_envec.data.cpu().numpy())
    # res_devec = embeddings.length_normalize(res_devec.data.cpu().numpy())

    res_envec = (res_envec.data.cpu().numpy())
    res_devec = (res_devec.data.cpu().numpy())
Example #15
0
def translate_topn(words_to_translate,
                   src_emb_info,
                   tgt_emb_info,
                   retrieval_method="csls",
                   topn=5,
                   csls_k=10,
                   batch_size=2500):
    """
    The top-n are not necessarily sorted, but the scores can be used to retrieve the sorted top-k candidates
    Only the 'csls' search implementation is complete
    """

    # Read source embeddings
    src_words, x = src_emb_info
    src_word2ind = build_w2i(src_words)

    # Read target embeddings
    tgt_words, z = tgt_emb_info
    tgt_word2ind = build_w2i(tgt_words)

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()

    for w in words_to_translate:
        try:
            all_words.append(w)
            w_ind = src_word2ind[w]
            trans_words.append(w)
            trans_idx.append(w_ind)
        except KeyError:
            oov.add(w)

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))
    src = trans_idx

    translation_topn = collections.defaultdict(list)
    translation_topn_prob = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities_idx = similarities.argsort(axis=1)
            similarities_scores = np.sort(similarities, axis=1)
            nn_topn = similarities_idx[:, -topn:]
            sim_unnorm = np.exp(similarities_scores[:, -topn:])
            sim_total = np.sum(sim_unnorm, axis=1).reshape(
                (sim_unnorm.shape[0],
                 1))  # sim_unnorm has same first dimension as sim_total
            nn_topn_prob = sim_unnorm / sim_total  # softmax probabilities (matches the 'csls' branch)

            for k in range(j - i):
                translation_topn[src[i + k]] = nn_topn[k]
                translation_topn_prob[src[i + k]] = nn_topn_prob[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])

        print('Computing X Neighbourhood')
        # batch_size=1000
        batch_num = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # similarities_x = np.sort(similarities, axis=1)
            similarities_x = -1 * np.partition(
                -1 * similarities, csls_k - 1, axis=1)
            #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k]
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')

        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1,
                axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num,
                                                      time.time() - t))
            batch_num += 1
        # gc.collect()
        # t=time.time()
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        # ipdb.set_trace()
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(np.transpose(2*similarities) - \
                                csls_alpha*nbrhood_x[src[i:j]])- \
                                csls_alpha*nbrhood_z

            similarities_idx = np.argpartition(-1 * similarities,
                                               topn - 1,
                                               axis=1)
            nn_topn = similarities_idx[:, :topn]
            row_x = np.arange(nn_topn.shape[0]).reshape(-1, 1)
            print('Shapes')
            print(similarities.shape)
            print(similarities_idx.shape)
            similarities_scores = similarities[row_x, nn_topn]
            sim_unnorm = np.exp(similarities_scores)

            #             similarities_idx = similarities.argsort(axis=1)
            #             similarities_scores = np.sort(similarities,axis=1)
            #             print(time.time()-t)
            #             nn_topn = similarities_idx[:,-topn:]
            #             sim_unnorm = np.exp(similarities_scores[:,-topn:])

            sim_total = np.sum(sim_unnorm, axis=1).reshape(
                (sim_unnorm.shape[0],
                 1))  # sim_unnorm has same first dimension as sim_total
            #             nn_topn_logprob=np.log(sim_unnorm/sim_total)  ## softmax log probabilities
            nn_topn_prob = sim_unnorm / sim_total  ## softmax probabilities

            for k in range(j - i):
                translation_topn[src[i + k]] = nn_topn[k]
                translation_topn_prob[src[i + k]] = nn_topn_prob[k]

        print('Completed in {0} seconds'.format(time.time() - t))

    # get translations
    trans_pairs = []
    for w in trans_words:
        if w in src_word2ind:
            srcid = src_word2ind[w]
            trans = [(tgt_words[translation_topn[srcid][r]],
                      translation_topn_prob[srcid][r]) for r in range(topn)]
            trans_pairs.append((w, trans))

    return dict(trans_pairs)
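
The probability attached to each candidate above is simply a softmax over that word's top-n retrieval scores. A minimal sketch of the conversion; the max-subtraction is added here for numerical stability and is not in the code above:

import numpy as np

def topn_softmax(scores):
    # Turn one row of top-n similarity scores into candidate probabilities.
    e = np.exp(scores - scores.max())
    return e / e.sum()

print(topn_softmax(np.array([0.71, 0.65, 0.40])))   # approx. [0.374, 0.352, 0.274]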
Example #16
0
            if i % 5 == 4:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 5))
                running_loss = 0.0
                # print(net.view1_fc.weight.grad)

    source_file = open('new_embedding_size200.en',
                       encoding='utf-8',
                       errors='surrogateescape')
    target_file = open('new_embedding_size200.de',
                       encoding='utf-8',
                       errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)

    en_vec = embeddings.length_normalize(en_vec)
    de_vec = embeddings.length_normalize(de_vec)

    input_view1, input_view2 = Variable(
        torch.from_numpy(en_vec).cuda()), Variable(
            torch.from_numpy(de_vec).cuda())

    res_envec = net(input_view1.float())

    src_file = open('LinearMappingres.en',
                    mode='w',
                    encoding='utf-8',
                    errors='surrogateescape')
    trg_file = open('LinearMappingres.de',
                    mode='w',
                    encoding='utf-8',
                    errors='surrogateescape')
Example #17
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('mid_input', help='the input pivot embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain1',
        '--dictionary_train1',
        default=sys.stdin.fileno(),
        help='the first training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtrain2',
        '--dictionary_train2',
        default=sys.stdin.fileno(),
        help='the second training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=1000,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs', 'geomm_cmp_pip')
    directory = os.path.join(
        os.path.join(os.getcwd(), method_name),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(
        os.path.basename(args.dictionary_test))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass

    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    midfile = open(args.mid_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')

    src_words, x = embeddings.read(srcfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    mid_words, y = embeddings.read(midfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    trg_words, z = embeddings.read(trgfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    mid_word2ind = {word: i for i, word in enumerate(mid_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary-1
    src_indices12 = []
    trg_indices12 = []
    f = open(args.dictionary_train1,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = mid_word2ind[trg]
            src_indices12.append(src_ind)
            trg_indices12.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    # Build training dictionary-2
    src_indices23 = []
    trg_indices23 = []
    f = open(args.dictionary_train2,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = mid_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices23.append(src_ind)
            trg_indices23.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            y = embeddings.length_normalize(y)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            y = embeddings.mean_center(y)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            y = embeddings.length_normalize_dimensionwise(y)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            y = embeddings.mean_center_embeddingwise(y)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1.1: Optimization-1
    if args.verbose:
        print('Beginning Optimization-1')
    start_time = time.time()

    x_count = len(set(src_indices12))
    y_count = len(set(trg_indices12))
    A = np.zeros((x_count, y_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices12)):
        if src_indices12[i] not in map_dict_src.keys():
            map_dict_src[src_indices12[i]] = I
            I += 1
            uniq_src.append(src_indices12[i])
    J = 0
    for j in range(len(trg_indices12)):
        if trg_indices12[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices12[j]] = J
            J += 1
            uniq_trg.append(trg_indices12[j])

    for i in range(len(src_indices12)):
        A[map_dict_src[src_indices12[i]], map_dict_trg[trg_indices12[i]]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(y[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))
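    # Reading the expression above off the code: this is the GeoMM objective
    #     || X_d (U1 B U2^T) Y_d^T - A ||_F^2  +  0.5 * Lambda * ||B||_F^2
    # where A is the 0/1 dictionary-alignment matrix built above, U1 and U2 are
    # orthonormal (Stiefel) factors, and B is positive definite, as enforced by
    # the Product manifold defined below.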

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(x.shape[1], low_rank),
        Stiefel(y.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    w12 = U1.dot(B).dot(U2.T)
    u11 = U1
    u21 = U2
    b1 = B

    # Step 1.2: Optimization-2
    if args.verbose:
        print('Beginning Optimization-2')
    y_count = len(set(src_indices23))
    z_count = len(set(trg_indices23))
    A = np.zeros((y_count, z_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices23)):
        if src_indices23[i] not in map_dict_src.keys():
            map_dict_src[src_indices23[i]] = I
            I += 1
            uniq_src.append(src_indices23[i])
    J = 0
    for j in range(len(trg_indices23)):
        if trg_indices23[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices23[j]] = J
            J += 1
            uniq_trg.append(trg_indices23[j])

    for i in range(len(src_indices23)):
        A[map_dict_src[src_indices23[i]], map_dict_trg[trg_indices23[i]]] = 1

    np.random.seed(0)
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    cost = TT.sum(((shared(y[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(y.shape[1], low_rank),
        Stiefel(z.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    w23 = U1.dot(B).dot(U2.T)
    u22 = U1
    u32 = U2
    b2 = B

    # Step 2: Transformation
    w12_1 = u11.dot(scipy.linalg.sqrtm(b1))
    w12_2 = u21.dot(scipy.linalg.sqrtm(b1))
    w23_1 = u22.dot(scipy.linalg.sqrtm(b2))
    w23_2 = u32.dot(scipy.linalg.sqrtm(b2))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))
    gc.collect()

    # Step 3: Evaluation
    # Loading test dictionary
    f = open(args.dictionary_test,
             encoding=args.encoding,
             errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    # Composition (CMP)
    xw = x.dot(w12).dot(w23)
    zw = z
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'CMP: Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))

    # Pipeline (PIP)
    xw = x.dot(w12_1)
    zw = y.dot(w12_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation12 = collections.defaultdict(int)
    # PIP-Stage 1
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        for k in range(j - i):
            translation12[src[i + k]] = nn[k]

    # PIP-Stage 2
    mid = [translation12[sr] for sr in src]
    xw = y.dot(w23_1)
    zw = z.dot(w23_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        # similarities_x = np.sort(similarities, axis=1)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[mid[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[mid[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'PIP: Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
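
The two evaluation modes above differ only in how the pivot language is used: CMP composes the two learned maps into a single source-to-target map, while PIP first retrieves a pivot-language translation for each source word and then translates that pivot word into the target language. A toy sketch of the two strategies with stand-in linear maps, ignoring the CSLS correction and length-normalization applied above (all names and matrices are illustrative):

import numpy as np

rng = np.random.RandomState(0)
d = 4
x = rng.randn(5, d)     # source embeddings
y = rng.randn(6, d)     # pivot embeddings
z = rng.randn(7, d)     # target embeddings
w12 = rng.randn(d, d)   # stand-in for the learned source->pivot map
w23 = rng.randn(d, d)   # stand-in for the learned pivot->target map

# Composition (CMP): chain the maps, then one nearest-neighbour search in the target space
cmp_pred = x.dot(w12).dot(w23).dot(z.T).argmax(axis=1)

# Pipeline (PIP): first pick the nearest pivot word, then translate that pivot word
pivot = x.dot(w12).dot(y.T).argmax(axis=1)
pip_pred = y[pivot].dot(w23).dot(z.T).argmax(axis=1)

print(cmp_pred, pip_pred)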
Example #18
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain',
        '--dictionary_train',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtrainspl',
        '--dictionary_trainspl',
        default=sys.stdin.fileno(),
        help='the training dictionary split file (defaults to stdin)')
    mapping_group.add_argument(
        '-dvalspl',
        '--dictionary_valspl',
        default=sys.stdin.fileno(),
        help='the validation dictionary split file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e-1,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')
    geomm_group.add_argument(
        '--x_cutoff',
        type=int,
        default=25000,
        help='Vocabulary cutoff for first language for bootstrapping')
    geomm_group.add_argument(
        '--z_cutoff',
        type=int,
        default=25000,
        help='Vocabulary cutoff for second language for bootstrapping')
    geomm_group.add_argument(
        '--patience',
        type=int,
        default=1,
        help=
        'Number of iterations with a decrease in validation accuracy permissible during bootstrapping'
    )

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=500,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs', 'geomm_semi')
    directory = os.path.join(
        os.path.join(os.getcwd(), method_name),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(
        os.path.basename(args.dictionary_train))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass

    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    trg_words, z = embeddings.read(trgfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()
    src_indices_train = list(src_indices)
    trg_indices_train = list(trg_indices)
    src_indices = []
    trg_indices = []

    # Loading train-split dictionary
    f = open(args.dictionary_trainspl,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)
    orig_src = src_indices
    orig_trg = trg_indices
    best_val_acc = 0
    best_add_src = []
    best_add_trg = []
    add_src = []
    add_trg = []

    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    it_count = 0
    drop_count = 0

    # Bootstrap loop
    while True:
        if args.verbose:
            print('Starting bootstrap iteration {0}'.format(it_count + 1))
        # Step 1.1: Optimization
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))

        # Creating dictionary matrix from training set
        map_dict_src = {}
        map_dict_trg = {}
        I = 0
        uniq_src = []
        uniq_trg = []
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]] = I
                I += 1
                uniq_src.append(src_indices[i])
        J = 0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]] = J
                J += 1
                uniq_trg.append(trg_indices[j])

        np.random.seed(0)
        Lambda = args.l2_reg
        U1 = TT.matrix()
        U2 = TT.matrix()
        B = TT.matrix()
        X_tot = x[uniq_src].T.dot(x[uniq_src])
        Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
        W = U1.dot(B.dot(U2.T))
        cost = (TT.nlinalg.trace(
            U2.dot(
                B.dot(
                    U1.T.dot(
                        shared(X_tot).dot(
                            U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))))) -
                2 * TT.sum(
                    (shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))
                ) / (len(src_indices)) + 0.5 * Lambda * (TT.sum(B**2))
        solver = ConjugateGradient(maxtime=args.max_opt_time,
                                   maxiter=args.max_opt_iter,
                                   mingradnorm=1e-15)

        low_rank = 300
        manifold = Product([
            Stiefel(x.shape[1], low_rank),
            Stiefel(z.shape[1], low_rank),
            PositiveDefinite(low_rank)
        ])
        problem = Problem(manifold=manifold,
                          cost=cost,
                          arg=[U1, U2, B],
                          verbosity=3)
        wopt = solver.solve(problem)
        w = wopt
        U1 = w[0]
        U2 = w[1]
        B = w[2]

        # Step 1.2: Transformation
        xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
        zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

        it_count += 1

        # Step 1.3: Compute Validation Accuracy
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)

        # Loading validation dictionary
        f = open(args.dictionary_valspl,
                 encoding=args.encoding,
                 errors='surrogateescape')
        src2trg = collections.defaultdict(set)
        trg2src = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            if args.max_vocab:
                src = src.lower()
                trg = trg.lower()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                trg2src[trg_ind].add(src_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        src = list(src2trg.keys())
        trgt = list(trg2src.keys())

        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        coverage = len(src2trg) / (len(src2trg) + len(oov))
        f.close()

        translation = collections.defaultdict(int)
        translation5 = collections.defaultdict(list)
        translation10 = collections.defaultdict(list)

        t = time.time()
        nbrhood_x = cp.zeros(xw.shape[0])
        nbrhood_z = cp.zeros(zw.shape[0])
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = -1 * cp.partition(
                -1 *
                cp.dot(cp.asarray(xw[src[i:j]]), cp.transpose(cp.asarray(zw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[src[i:j]] = (cp.mean(similarities, axis=1))

        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))

        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(xw[src[i:j]]).dot(
                    cp.transpose(cp.asarray(zw)))) -
                nbrhood_x[src[i:j]]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            similarities = cp.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k].tolist()
                translation10[src[i + k]] = nn10[k].tolist()
        accuracy = np.mean(
            [1 if translation[i] in src2trg[i] else 0 for i in src])
        mean = 0
        for i in src:
            for k in translation5[i]:
                if k in src2trg[i]:
                    mean += 1
                    break

        mean /= len(src)
        accuracy5 = mean

        mean = 0
        for i in src:
            for k in translation10[i]:
                if k in src2trg[i]:
                    mean += 1
                    break

        mean /= len(src)
        accuracy10 = mean
        drop_count += 1
        if accuracy > best_val_acc:
            if args.verbose:
                print('Improvement of {0}%  over best validation accuracy!'.
                      format((accuracy - best_val_acc) * 100))
            best_val_acc = accuracy
            best_add_src = list(add_src)
            best_add_trg = list(add_trg)
            drop_count = 0

        if args.verbose:
            print(
                'Val Set:- Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
                .format(coverage, accuracy, accuracy5, accuracy10))
        if drop_count >= args.patience:
            if args.verbose:
                print('Training ended')
            break

        # Step 1.4: Dictionary Induction Stage (Bootstrap)
        # x_cutoff and z_cutoff define the working vocabularies of the two languages (the first
        # k words of standard pre-trained embeddings are the most frequent ones). CSLS inference
        # is performed on this vocabulary subset, with bidirectional bootstrapping: dictionary
        # entries are induced for the first x_cutoff words of Language-1 and the first z_cutoff
        # words of Language-2, and the original training dictionary is added back.
        # Total dictionary size = x_cutoff + z_cutoff + size(train_set)
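        # Note: the induction score computed below is CSLS,
        #     CSLS(x, z) = 2 * cos(x, z) - r_Z(x) - r_X(z),
        # where r_Z(x) (nbrhood_x) is the mean similarity of x to its csls_neighbourhood nearest
        # neighbours among the target words and r_X(z) (nbrhood_z) is the analogous term for z.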
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)

        x_vocab_size = min(xw.shape[0], args.x_cutoff)
        z_vocab_size = min(zw.shape[0], args.z_cutoff)
        t = time.time()
        nbrhood_x = cp.zeros(x_vocab_size)
        best_sim_x = cp.zeros(x_vocab_size)
        best_sim_x_csls = cp.zeros(x_vocab_size)
        nbrhood_z = cp.zeros(z_vocab_size)

        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(xw[i:j]),
                            cp.transpose(cp.asarray(zw[:z_vocab_size]))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[i:j] = (cp.mean(similarities, axis=1))
            best_sim_x[i:j] = (cp.max(similarities, axis=1))
            batch_num += 1

        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]),
                            cp.transpose(cp.asarray(xw[:x_vocab_size]))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))
            batch_num += 1

        src_indices = list(range(0, x_vocab_size))
        trg_indices = []
        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(xw[i:j]).dot(
                    cp.transpose(cp.asarray(zw[:z_vocab_size])))) -
                nbrhood_x[i:j]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            trg_indices.append(nn)
            batch_num += 1

        src_indices2 = []
        trg_indices2 = list(range(0, z_vocab_size))
        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(zw[i:j]).dot(
                    cp.transpose(cp.asarray(xw[:x_vocab_size])))) -
                nbrhood_z[i:j]) - nbrhood_x
            nn = cp.argmax(similarities, axis=1).tolist()
            src_indices2.append(nn)
            batch_num += 1
        trg_indices = [item for sublist in trg_indices for item in sublist]
        src_indices2 = [item for sublist in src_indices2 for item in sublist]

        add_src = list(src_indices + src_indices2)
        add_trg = list(trg_indices + trg_indices2)
        src_indices = src_indices + src_indices2 + orig_src
        trg_indices = trg_indices + trg_indices2 + orig_trg

    end_time = time.time()
    if args.verbose:
        print('Completed bootstrapping in {0:.2f} seconds'.format(end_time -
                                                                  start_time))

    # Step 2: Final Training with bootstrapped dictionary
    if args.verbose:
        print('Training final model')
    src_indices = best_add_src + src_indices_train
    trg_indices = best_add_trg + trg_indices_train
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]] = I
            I += 1
            uniq_src.append(src_indices[i])
    J = 0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]] = J
            J += 1
            uniq_trg.append(trg_indices[j])

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    X_tot = x[uniq_src].T.dot(x[uniq_src])
    Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
    W = U1.dot(B.dot(U2.T))
    cost = (TT.nlinalg.trace(
        U2.dot(
            B.dot(
                U1.T.dot(
                    shared(X_tot).dot(U1.dot(B.dot(U2.T.dot(shared(Z_tot)))))))
        )) - 2 * TT.sum(
            (shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))
            ) / len(src_indices) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(x.shape[1], low_rank),
        Stiefel(z.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    gc.collect()

    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')

    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)
    # Loading test dictionary
    f = open(args.dictionary_test,
             encoding=args.encoding,
             errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities, axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
示例#19
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path', default=None, type=str, help='directory to save the model')
    parser.add_argument('--geomm_embeddings_path', default=None, type=str, help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e2, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    ## Logging
    #method_name = os.path.join('logs','geomm')
    #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    #if not os.path.exists(directory):
    #    os.makedirs(directory)
    #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train))
    #log_file_name = log_file_name + '.log'
    #class Logger(object):
    #    def __init__(self):
    #        self.terminal = sys.stdout
    #        self.log = open(os.path.join(directory,log_file_name), "a")

    #    def write(self, message):
    #        self.terminal.write(message)
    #        self.log.write(message)

    #    def flush(self):
    #        #this flush method is needed for python 3 compatibility.
    #        #this handles the flush command by doing nothing.
    #        #you might want to specify some extra behavior here.
    #        pass
    #sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile,max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    noov=0
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src,trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            noov+=1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg)) #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(noov))
    src_indices = src_indices
    trg_indices = trg_indices
    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)


    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))
    A = np.zeros((x_count,z_count))

    # Creating dictionary matrix from training set
    map_dict_src={}
    map_dict_trg={}
    I=0
    uniq_src=[]
    uniq_trg=[]
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]]=I
            I+=1
            uniq_src.append(src_indices[i])
    J=0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]]=J
            J+=1
            uniq_trg.append(trg_indices[j])

    for i in range(len(src_indices)):
        A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1

    np.random.seed(0)
    Lambda=args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B  = TT.matrix()

    Kx, Kz = x[uniq_src], z[uniq_trg]
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    # AA = np.sum(A*A) # this can be added if cost needs to be compared to original geomm

    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5*Lambda*(TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    # cost += shared(AA) # this can be added if cost needs to be compared with original geomm

    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter)

    manifold =Product([Stiefel(x.shape[1], x.shape[1]),Stiefel(z.shape[1], x.shape[1]),PositiveDefinite(x.shape[1])])
    #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1,U2,B], verbosity=3)
    wopt = solver.solve(problem)

    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path,exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path),U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path),U2)
        np.savetxt('{}/B.csv'.format(args.model_path),B)

    # Step 2: Transformation
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'src.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(src_words,xw_n,outfile)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'trg.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(trg_words,zw_n,outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    X = xw[src_indices]
    Z = zw[trg_indices]

    # Loading test dictionary
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    ### compute nearest neigbours of x in z
    t=time.time()
    nbrhood_x=np.zeros(xw.shape[0])

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
        nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)

    ### compute nearest neigbours of z in x (GPU version)
    nbrhood_z=np.zeros(zw.shape[0])
    with cp.cuda.Device(0):
        nbrhood_z2=cp.zeros(zw.shape[0])
        batch_num=1
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
            nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
            batch_num+=1
        nbrhood_z=cp.asnumpy(nbrhood_z2)

    #### compute nearest neigbours of z in x (CPU version)
    #nbrhood_z=np.zeros(zw.shape[0])
    #for i in range(0, len(zw.shape[0]), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(zw.shape[0]))
    #    similarities = zw[i:j].dot(xw.T)
    #    similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
    #    nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1)

    #### find translation
    #for i in range(0, len(src), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(src))
    #    similarities = xw[src[i:j]].dot(zw.T)
    #    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
    #    nn = similarities.argmax(axis=1).tolist()
    #    similarities = np.argsort((similarities),axis=1)

    #    nn5 = (similarities[:,-5:])
    #    nn10 = (similarities[:,-10:])
    #    for k in range(j-i):
    #        translation[src[i+k]] = nn[k]
    #        translation5[src[i+k]] = nn5[k]
    #        translation10[src[i+k]] = nn10[k]


    #if args.geomm_embeddings_path is not None:
    #    delim=','
    #    os.makedirs(args.geomm_embeddings_path,exist_ok=True)

    #    translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
    #    with open(translations_fname,'w',encoding=args.encoding) as translations_file:
    #        for src_id in src:
    #            src_word = src_words[src_id]
    #            all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
    #            trgout_words = [ trg_words[j] for j in translation10[src_id] ]
    #            ss = list(nn10[src_id,:])
    #
    #            p1 = ':'.join(all_trg_words)
    #            p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
    #            translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) )

    ### find translation  (and write to file if output requested)
    delim=','
    translations_file =None
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)
        translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
        translations_file = open(translations_fname,'w',encoding=args.encoding)

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities),axis=1)

        nn5 = (similarities[:,-5:])
        nn10 = (similarities[:,-10:])
        for k in range(j-i):
            translation[src[i+k]] = nn[k]
            translation5[src[i+k]] = nn5[k]
            translation10[src[i+k]] = nn10[k]


            if args.geomm_embeddings_path is not None:
                src_id=src[i+k]
                src_word = src_words[src_id]
                all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
                trgout_words = [ trg_words[j] for j in translation10[src_id] ]
                #ss = list(nn10[src_id,:])

                p1 = ':'.join(all_trg_words)
                p2 = ':'.join(trgout_words)
                #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
                translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim) )

    if args.geomm_embeddings_path is not None:
        translations_file.close()

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean=0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy5 = mean

    mean=0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy10 = mean
    message = args.src_input.split(".")[-2] + "-->" + args.trg_input.split(".")[-2] + ": " + \
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy)
    print(message)
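# Note: a hypothetical invocation of the script above (the file name geomm.py and the data paths
# are placeholders; the flags come from the argparse definitions earlier in main):
#   python geomm.py src.vec trg.vec -dtrain train.dict -dtest test.dict \
#       --normalize unit center --normalize_eval --l2_reg 1e2 --max_opt_iter 150 --verbose 1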
示例#20
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Evaluate embeddings of two languages in a shared space in word translation induction'
    )
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        help='the target language embeddings')
    parser.add_argument('-d',
                        '--dictionary',
                        default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument(
        '--retrieval',
        default='nn',
        choices=['nn', 'invnn', 'invsoftmax', 'csls', 'fcsls'],
        help=
        'the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling; fcsls: fast CSLS restricted to the top faiss nearest-neighbour candidates)'
    )
    parser.add_argument(
        '--inv_temperature',
        default=1,
        type=float,
        help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument(
        '--inv_sample',
        default=None,
        type=int,
        help=
        'use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)'
    )
    parser.add_argument(
        '--neighborhood',
        default=10,
        type=int,
        help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--nbest',
                        default=3,
                        type=int,
                        help='number of candidates to get')
    parser.add_argument(
        '--dot',
        action='store_true',
        help=
        'use the dot product in the similarity computations instead of the cosine'
    )
    parser.add_argument('--verbose',
                        action='store_true',
                        help='verbose, print more information')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--vocabulary_cutoff',
                        default=0,
                        type=int,
                        help='vocab limit for reading the embedding')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,
                                   dtype=dtype,
                                   threshold=args.vocabulary_cutoff)
    trg_words, z = embeddings.read(trgfile,
                                   dtype=dtype,
                                   threshold=args.vocabulary_cutoff)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    oov = set()
    vocab = set()
    src = []
    for line in f:
        if '\t' in line:
            w, _ = line.split('\t')
        elif ' ' in line:
            w, _ = line.split(' ')
        else:
            w = line.strip()
        if w in vocab:
            continue
        try:
            src.append(src_word2ind[w])
            vocab.add(w)
        except KeyError:
            oov.add(w)

    if args.verbose:
        print(f'{len(oov)} oovs: ' + '|'.join(list(oov)[:10]), file=sys.stderr)

    if args.retrieval == 'nn':  # Standard nearest neighbor
        queries = x[src]
        topvals, topinds = embeddings.faiss_knn(queries, z, k=args.nbest)
        for i, wind in enumerate(src):
            w = src_ind2word[wind]
            for k, tind in enumerate(topinds[i]):
                wt = trg_ind2word[tind]
                st = topvals[i, k]
                print(f'{w}\t{wt}\t{st:.3f}')
    elif args.retrieval == 'fcsls':  # Cross-domain similarity local scaling
        sim_bwd, _ = embeddings.faiss_knn(z, x, k=args.neighborhood)
        knn_sim_bwd = sim_bwd.mean(axis=1)
        queries = x[src]
        topvals, topinds = embeddings.faiss_knn(queries, z, k=30)
        for i, wind in enumerate(src):
            w = src_ind2word[wind]
            for k, tind in enumerate(topinds[i]):
                wt = trg_ind2word[tind]
                st = 2 * topvals[i, k] - knn_sim_bwd[topinds[i, k]]
                print(f'{w}\t{wt}\t{st:.3f}')
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        sim_bwd, _ = embeddings.faiss_knn(z, x, k=args.neighborhood)
        knn_sim_bwd = sim_bwd.mean(axis=1)
        queries = x[src]

        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2 * x[src[i:j]].dot(
                z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
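            # Note: relative to full CSLS this drops the query-side term r(x) (the mean similarity
            # of x to its k nearest target neighbours); that term is constant across all target
            # candidates for a fixed x, so the argmax / top-k ranking is unchanged and only the
            # printed scores are shifted by a per-query constant.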
            nn = (-similarities).argpartition(args.nbest, axis=1)
            for k in range(j - i):
                w = src_ind2word[src[i + k]]
                for tind in nn[k, :args.nbest]:
                    wt = trg_ind2word[tind]
                    st = similarities[k, tind]
                    print(f'{w}\t{wt}\t{st:.3f}')
示例#21
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word analogy')
    parser.add_argument('embeddings', help='the word embeddings')
    parser.add_argument(
        '-t',
        '--threshold',
        type=int,
        default=0,
        help=
        'reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)'
    )
    parser.add_argument('-i',
                        '--input',
                        default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l',
                        '--lowercase',
                        action='store_true',
                        help='lowercase the words in the test file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    f = open(args.embeddings, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)

    # Build word to index map
    word2ind = {word: i for i, word in enumerate(words)}

    # Length normalize embeddings
    embeddings.length_normalize(matrix)

    # Parse test file
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    categories = []
    src1 = []
    trg1 = []
    src2 = []
    trg2 = []
    for line in f:
        if line.startswith(': '):
            name = line[2:-1]
            is_syntactic = name.startswith('gram')
            categories.append({
                'name': name,
                'is_syntactic': is_syntactic,
                'total': 0,
                'oov': 0
            })
        else:
            try:
                ind = [
                    word2ind[word.lower() if args.lowercase else word]
                    for word in line.split()
                ]
                src1.append(ind[0])
                trg1.append(ind[1])
                src2.append(ind[2])
                trg2.append(ind[3])
                categories[-1]['total'] += 1
            except KeyError:
                categories[-1]['oov'] += 1
    total = len(src1)

    # Compute nearest neighbors using efficient matrix multiplication
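    # Note: this is the 3CosAdd analogy query, d ~= argmax_v cos(v, c - a + b) with
    # (a, b, c, d) = (src1, trg1, src2, trg2); the three query words themselves are masked out
    # below by setting their similarities to -1 so they cannot be returned as the prediction.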
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (matrix[src2[i:j]] - matrix[src1[i:j]] +
                        matrix[trg1[i:j]]).dot(matrix.T)
        similarities[range(j - i), src1[i:j]] = -1
        similarities[range(j - i), trg1[i:j]] = -1
        similarities[range(j - i), src2[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        correct = np.sum(nn[ind:ind +
                            category['total']] == trg2[ind:ind +
                                                       category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                category['total'] / (category['total'] + category['oov']),
                correct / category['total'], category['name']))
    if args.verbose:
        print('-' * 80)
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.
          format((semantic['total'] + syntactic['total']) /
                 (semantic['total'] + syntactic['total'] + semantic['oov'] +
                  syntactic['oov']),
                 (semantic['correct'] + syntactic['correct']) /
                 (semantic['total'] + syntactic['total']),
                 semantic['correct'] / semantic['total'],
                 syntactic['correct'] / syntactic['total']))
示例#22
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('emb_file', help='the input target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
  
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain_file', '--dictionary_train_file', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest_file', '--dictionary_test_file', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    
    geomm_group = parser.add_argument_group('GeoMM Multi arguments', 'Arguments for GeoMM Multi method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e3, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')
   
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs','geomm_multi')
    directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file))
    log_file_name = log_file_name + '.log'
    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory,log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)  

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass    
    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    words = []
    emb = []
    with open(args.emb_file, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            srcfile = open(line.strip(), encoding=args.encoding, errors='surrogateescape')
            words_temp, x_temp = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
            words.append(words_temp)
            emb.append(x_temp)


    # Build word to index map
    word2ind = []
    for lang in words:
        word2ind.append({word: i for i, word in enumerate(lang)})

    # Build training dictionary
    train_pairs = []
    with open(args.dictionary_train_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            src_indices = []
            trg_indices = []
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                for line in f:
                    src,trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        src_ind = word2ind[curr_dict[0]][src]
                        trg_ind = word2ind[curr_dict[1]][trg]
                        src_indices.append(src_ind)
                        trg_indices.append(trg_ind)
                    except KeyError:
                        if args.verbose:
                            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
            curr_dict.append(src_indices)
            curr_dict.append(trg_indices)
            train_pairs.append(curr_dict)
    if args.verbose:
        print('Normalizing embeddings...')
    # Step 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize(emb[i])
        elif action == 'center':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center(emb[i])
        elif action == 'unitdim':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize_dimensionwise(emb[i])
        elif action == 'centeremb':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center_embeddingwise(emb[i])


    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    mean_size=0
    for tp in range(len(train_pairs)):
        src_indices = train_pairs[tp][2]
        trg_indices = train_pairs[tp][3]
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))
        A = np.zeros((x_count,z_count))
        
        # Creating dictionary matrix from training set
        map_dict_src={}
        map_dict_trg={}
        I=0
        uniq_src=[]
        uniq_trg=[]
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]]=I
                I+=1
                uniq_src.append(src_indices[i])
        J=0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]]=J
                J+=1
                uniq_trg.append(trg_indices[j])

        for i in range(len(src_indices)):
            A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1
        train_pairs[tp].append(uniq_src)
        train_pairs[tp].append(uniq_trg)
        train_pairs[tp].append(A)
        mean_size+= (len(uniq_src)*len(uniq_trg))
    mean_size = mean_size/len(train_pairs)
    np.random.seed(0)
    Lambda=args.l2_reg

    variables=[]
    manif = []
    low_rank=emb[0].shape[1]
    for i in range(len(emb)):
        variables.append(TT.matrix())
        manif.append(Stiefel(emb[i].shape[1],low_rank))
    variables.append(TT.matrix())
    manif.append(PositiveDefinite(low_rank))
    B = variables[-1]
    cost = 0.5*Lambda*(TT.sum(B**2))
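    # Note: each language i has its own Stiefel factor variables[i], while a single
    # positive-definite B is shared across all pairs. For every training pair (s, t) the loop
    # below adds the term ||X_s (U_s B U_t^T) Z_t^T - A_st||_F^2 / n_st, where X_s and Z_t hold
    # the unique dictionary words of the pair and n_st is the number of entries in its dictionary.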
    for i in range(len(train_pairs)):
        x = emb[train_pairs[i][0]]
        z = emb[train_pairs[i][1]]
        U1 = variables[train_pairs[i][0]]
        U2 = variables[train_pairs[i][1]]
        cost = cost + TT.sum(
            ((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T))))
             .dot(shared(z[train_pairs[i][5]]).T)
             - shared(train_pairs[i][6]))**2
        ) / float(len(train_pairs[i][2]))
    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter,mingradnorm=1e-12)
    manifold =Product(manif)
    problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3)
    wopt = solver.solve(problem)
    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    # Step 2: Transformation
    Bhalf = scipy.linalg.sqrtm(wopt[-1])
    test_emb = []
    for i in range(len(emb)):
        test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()


    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')

    if args.normalize_eval:
        for i in range(len(test_emb)):
            test_emb[i] = embeddings.length_normalize(test_emb[i])

    # Loading test dictionary
    with open(args.dictionary_test_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                src_word2ind = word2ind[curr_dict[0]]
                trg_word2ind = word2ind[curr_dict[1]]
                xw = test_emb[curr_dict[0]]
                zw = test_emb[curr_dict[1]]
                src2trg = collections.defaultdict(set)
                trg2src = collections.defaultdict(set)
                oov = set()
                vocab = set()
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        src_ind = src_word2ind[src]
                        trg_ind = trg_word2ind[trg]
                        src2trg[src_ind].add(trg_ind)
                        trg2src[trg_ind].add(src_ind)
                        vocab.add(src)
                    except KeyError:
                        oov.add(src)
                src = list(src2trg.keys())
                trgt = list(trg2src.keys())

                oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
                coverage = len(src2trg) / (len(src2trg) + len(oov))
                f.close()

                translation = collections.defaultdict(int)
                translation5 = collections.defaultdict(list)
                translation10 = collections.defaultdict(list)

                t=time.time()
                nbrhood_x=np.zeros(xw.shape[0])
                nbrhood_z=np.zeros(zw.shape[0])
                nbrhood_z2=cp.zeros(zw.shape[0])
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
                    nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)
                batch_num=1
                for i in range(0, zw.shape[0], BATCH_SIZE):
                    j = min(i + BATCH_SIZE, zw.shape[0])
                    similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
                    nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
                    batch_num+=1
                nbrhood_z=cp.asnumpy(nbrhood_z2)
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]])- nbrhood_z
                    nn = similarities.argmax(axis=1).tolist()
                    similarities = np.argsort((similarities),axis=1)

                    nn5 = (similarities[:,-5:])
                    nn10 = (similarities[:,-10:])
                    for k in range(j-i):
                        translation[src[i+k]] = nn[k]
                        translation5[src[i+k]] = nn5[k]
                        translation10[src[i+k]] = nn10[k]
                accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
                mean=0
                for i in src:
                    for k in translation5[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break

                mean/=len(src)
                accuracy5 = mean

                mean=0
                for i in src:
                    for k in translation10[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break

                mean/=len(src)
                accuracy10 = mean
                print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
示例#23
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Evaluate embeddings of two languages in a shared space in word translation induction'
    )
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        help='the target language embeddings')
    parser.add_argument('-d',
                        '--dictionary',
                        default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_embeddings,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Compute accuracy
    correct = 0
    src, trg = zip(*src2trg.items())
    for i in range(0, len(src2trg), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src2trg))
        similarities = src_matrix[list(src[i:j])].dot(trg_matrix.T)
        nn = np.argmax(similarities, axis=1).tolist()
        for k in range(j - i):
            if nn[k] in trg[i + k]:
                correct += 1
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(
        coverage, correct / len(src2trg)))
示例#24
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word analogy')
    parser.add_argument('--src_embeddings', help='the word embeddings for source (left side)')
    parser.add_argument('--trg_embeddings', help='the word embeddings for target (right side)')
    parser.add_argument('-t', '--threshold', type=int, default=0, help='reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the test file (defaults to stdin)')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose output (give category specific results)')
    parser.add_argument('-l1', '--src_lowercase', action='store_true', help='lowercase the words in the test file')
    parser.add_argument('-l2', '--trg_lowercase', action='store_true', help='lowercase the words in the test file')    
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    f = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()
    f = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    trg_words, trg_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()
    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}
    
    # Length normalize embeddings
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)
    
    # Parse the test file: analogy questions a : b :: c : d,
    # answered by retrieving the target word closest to c - a + b
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    categories = []
    a = []  # source-language word indices
    b = []  # source-language word indices
    c = []  # target-language word indices
    d = []  # target-language word indices (gold answers)
    linecounter = 0
    for line in f:
        if line.startswith(': '):
            name = line[2:-1]
            is_syntactic = name.startswith('gram')
            categories.append({'name': name, 'is_syntactic': is_syntactic, 'total': 0, 'oov': 0})
        else:
            try:
                words = line.split()

                w0 = src_word2ind[words[0].lower() if args.src_lowercase else words[0]]
                w1 = src_word2ind[words[1].lower() if args.src_lowercase else words[1]]
                w2 = trg_word2ind[words[2].lower() if args.trg_lowercase else words[2]]
                w3 = trg_word2ind[words[3].lower() if args.trg_lowercase else words[3]]

                a.append(w0)
                b.append(w1)
                c.append(w2)
                d.append(w3)
                
                categories[-1]['total'] += 1
            except KeyError:
                categories[-1]['oov'] += 1
    total = len(a)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (trg_matrix[c[i:j]] - src_matrix[a[i:j]] + src_matrix[b[i:j]]).dot(trg_matrix.T)
        similarities[range(j-i), a[i:j]] = -1
        similarities[range(j-i), b[i:j]] = -1
        similarities[range(j-i), c[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    with open('crosslingual_predict.txt', 'w') as outfile:
        for i in range(len(nn)):
            outfile.write(src_ind2word[a[i]]+' '+src_ind2word[b[i]]+' '+trg_ind2word[c[i]]+' '+trg_ind2word[d[i]]+' | '+trg_ind2word[nn[i]]+'\n')
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        correct = np.sum(nn[ind:ind+category['total']] == d[ind:ind+category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                category['total'] / (category['total'] + category['oov']),
                correct / category['total'],
                category['name']))
    if args.verbose:
        print('-'*80)
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        (semantic['total'] + syntactic['total']) / (semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        (semantic['correct'] + syntactic['correct']) / (semantic['total'] + syntactic['total']),
        semantic['correct'] / semantic['total'],
        syntactic['correct'] / syntactic['total']))
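
# --- Hedged usage sketch (not part of the original snippet) -----------------
# The batched loop above implements plain 3CosAdd retrieval: after length
# normalization, argmax over (c - a + b) . trg_matrix.T picks the target word
# most cosine-similar to the offset query. A tiny self-contained illustration
# follows; the vectors and word labels are made up purely for this example.
import numpy as np

def analogy_nn(a_vec, b_vec, c_vec, trg_matrix):
    # Return the index of the target row with the largest dot product
    # against the analogy query c - a + b.
    query = c_vec - a_vec + b_vec
    return int(np.argmax(trg_matrix.dot(query)))

trg = np.array([[1.0, 0.0],   # index 0: a "king"-like target word
                [0.0, 1.0],   # index 1: a "queen"-like target word
                [0.5, 0.5]],  # index 2: some other target word
               dtype='float32')
man   = np.array([0.9, 0.1], dtype='float32')  # source-side "man"
woman = np.array([0.1, 0.9], dtype='float32')  # source-side "woman"
king  = trg[0]                                 # target-side cue c
print(analogy_nn(man, woman, king, trg))       # -> 1 (the "queen"-like row)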