Example #1
    def __init__(self, dataPtr, lambda1=1e-2, rank=10):
        """ Initialize parameters

        Args:
            dataPtr (DataPtr): An object which contains the X and Z side features and the target matrix Y.
            lambda1 (uint): Regularizer.
            rank (uint): rank of the U, B, V parametrization.
        """

        self.dataset = dataPtr
        self.X = self.dataset.get_entity("row")
        self.Z = self.dataset.get_entity("col")
        self.rank = rank
        self._loadTarget()
        self.shape = (self.X.shape[0], self.Z.shape[0])
        self.lambda1 = lambda1
        self.nSamples = self.Y.data.shape[0]

        self.W = None
        self.optima_reached = False
        self.manifold = Product([
            Stiefel(self.X.shape[1], self.rank),
            PositiveDefinite(self.rank),
            Stiefel(self.Z.shape[1], self.rank)
        ])
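A minimal usage sketch (not part of the snippet above): the same Product manifold can be handed to a Pymanopt problem that optimizes W = U B V^T, here with an illustrative dense squared-error cost over the target matrix Y.

import autograd.numpy as np
from pymanopt import Problem
from pymanopt.manifolds import Product, Stiefel, PositiveDefinite
from pymanopt.solvers import ConjugateGradient

def build_problem(X, Z, Y, rank):
    # Same parametrization as above: U (row features), B (SPD core), V (column features).
    manifold = Product([
        Stiefel(X.shape[1], rank),
        PositiveDefinite(rank),
        Stiefel(Z.shape[1], rank),
    ])

    def cost(params):
        U, B, V = params
        pred = X.dot(U).dot(B).dot(V.T).dot(Z.T)  # predicted target matrix
        return np.sum((pred - Y) ** 2)            # illustrative dense squared error

    return Problem(manifold=manifold, cost=cost, verbosity=0)

# Uopt, Bopt, Vopt = ConjugateGradient(maxiter=100).solve(build_problem(X, Z, Y, rank=10))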
Example #2
def test_inferer_infer(dataPtr):
    test_data = dataPtr

    rowFeatureDim = test_data.get_entity("row").shape[1]
    colFeatureDim = test_data.get_entity("col").shape[1]
    rank = 2
    W = [
        Stiefel(rowFeatureDim, rank).rand(),
        PositiveDefinite(rank).rand(),
        Stiefel(colFeatureDim, rank).rand(),
    ]

    Inferer(method="dot").infer(test_data, W)

    inference = Inferer(method="dot",
                        transformation="mean").infer(test_data, W)
    nOccurences = collections.Counter(inference.ravel())
    assert nOccurences[0] + nOccurences[1] == inference.size

    k = 2
    inference = Inferer(method="dot", k=k,
                        transformation="topk").infer(test_data, W)
    nOccurences = collections.Counter(inference.ravel())
    assert nOccurences[0] + nOccurences[1] == inference.size
    assert np.max(np.count_nonzero(inference == 1, axis=0)) <= k
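Illustrative aside (not from the test module above): the last two assertions hold if the "topk" transformation keeps, per column, only the k highest scores as ones. A plain-NumPy sketch of that idea, with hypothetical names:

import numpy as np

def topk_binarize(scores, k):
    # Set the k largest entries of every column to 1 and everything else to 0.
    out = np.zeros_like(scores)
    top_rows = np.argpartition(-scores, k - 1, axis=0)[:k]
    out[top_rows, np.arange(scores.shape[1])] = 1
    return out

binary = topk_binarize(np.random.rand(6, 4), k=2)
assert np.all(np.count_nonzero(binary == 1, axis=0) <= 2)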
Example #3
    def fit(self, T, Y, init, maxIter=100):
        self.init_fit(T, Y, None)

        D = self.D + self.L
        K = self.K

        # (1) Instantiate the manifold
        manifold = Product([PositiveDefinite(D + 1, k=K), Euclidean(K - 1)])

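        # (2) Define the cost function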
        cost = self.get_cost_function(T, Y)

        problem = Problem(manifold=manifold, cost=cost, verbosity=1)

        # (3) Instantiate a Pymanopt solver
        solver = SteepestDescent(maxiter=3 * maxIter)

        # let Pymanopt do the rest
        Xopt = solver.solve(problem)
        self.Xopt_to_theta(Xopt)
Example #4
    def __init__(self, Xs, Xt, A, lbda, rank, device=-1):
        self.Xs = Xs
        self.Xt = Xt
        self.A = A
        self.rank = rank
        self.lbda = lbda
        assert isinstance(self.Xs, torch.Tensor)
        assert isinstance(self.Xt, torch.Tensor)
        assert isinstance(self.A, torch.Tensor)
        self.device = device

        d1 = self.Xs.size(1)
        d2 = self.Xt.size(1)

        assert (d1 == rank == d2), f"Found dimensions {d1}, {rank}, {d2}"
        d = d1
        self.manifold = Product(
            [Stiefel(d, d), PositiveDefinite(d),
             Stiefel(d, d)])
Example #5
def main():
    r"""Main entry point in the graph embedding procedure."""
    args = config_parser().parse_args()

    g_pdists = load_pdists(args)
    n = g_pdists.shape[0]
    d = args.manifold_dim

    # we actually use only the strictly upper-triangular part
    g_pdists = g_pdists[np.triu_indices(n, 1)]
    g_sq_pdists = g_pdists**2

    # read the graph
    # the distortion cost
    def distortion_cost(X):
        man_sq_pdists = manifold_pdists(X, squared=True)

        return np.sum(np.abs(man_sq_pdists / g_sq_pdists - 1))

    # the manifold, problem, and solver
    manifold = PositiveDefinite(d, k=n)
    problem = Problem(manifold=manifold, cost=distortion_cost, verbosity=2)
    linesearch = ReduceLROnPlateau(start_lr=2e-2,
                                   patience=10,
                                   threshold=1e-4,
                                   factor=0.1,
                                   verbose=1)
    solver = ConjugateGradient(linesearch=linesearch, maxiter=1000)

    # solve it
    with Timer('training') as t:
        X_opt = solver.solve(problem, x=sample_init_points(n, d))

    # the distortion achieved
    man_pdists = manifold_pdists(X_opt)
    print('Average distortion: ', average_distortion(g_pdists, man_pdists))
    man_pdists_sym = pdists_vec_to_sym(man_pdists, n, 1e12)
    print('MAP: ', mean_average_precision(g,
                                          man_pdists_sym,
                                          diag_adjusted=True))
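Side note on the indexing in main() above: np.triu_indices(n, 1) enumerates the strictly upper-triangular (i < j) pairs, so g_pdists becomes a vector of n*(n-1)/2 pairwise distances, which is the shape manifold_pdists is expected to match. A tiny standalone check:

import numpy as np

D = np.arange(16).reshape(4, 4)   # stand-in 4 x 4 "distance" matrix
print(D[np.triu_indices(4, 1)])   # [ 1  2  3  6  7 11] -> 4*3/2 = 6 entries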
Example #6
# number of points, number of total features, number of relevant features
n, p, d = 50, 10, 5
# regularization parameter for L_1 cost
lam1 = 0.05
# regularization parameter for L_{1,2} cost
lam12 = 0.05
# n p-dimensional feature vectors
X = features(n, p)
# The true Kernel, a p by p symmetric PSD kernel with d^2 non-zero entries
Ktrue = kernel(p, d)
print('Ktrue shape', np.shape(Ktrue))
pulls = 5000  # number of triplets gathered
S = triplets(Ktrue, X, pulls, noise=True)  # get some triplets

# Instantiate a manifold
manifold = PositiveDefinite(p)
# Define the cost function for the L_1 and L_{1,2} problems (here using
# autograd.numpy)


@primitive
def norm1(x):
    return np.sum(np.abs(x))


def make_grad_norm1(ans, x):
    def gradient_product(g):
        s = np.sign(x)
        return np.full(x.shape, g) * s

    return gradient_product
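The snippet above stops before the custom gradient is registered and before the costs are assembled. A hedged sketch of that wiring, assuming a recent autograd (defvjp from autograd.extend) and an illustrative logistic triplet loss; the loss actually used by the original script is not shown, and each triplet (i, j, k) in S is assumed to mean that item i is closer to j than to k under the kernel:

from autograd.extend import defvjp
from pymanopt import Problem

# Register the hand-written vector-Jacobian product for the non-smooth L1 norm.
defvjp(norm1, make_grad_norm1)

def cost_l1(K):
    # Illustrative logistic triplet loss plus the L1 penalty via the custom primitive.
    loss = 0.
    for i, j, k in S:
        d_ij = (X[i] - X[j]).dot(K).dot(X[i] - X[j])
        d_ik = (X[i] - X[k]).dot(K).dot(X[i] - X[k])
        loss = loss + np.log(1 + np.exp(d_ij - d_ik))
    return loss / len(S) + lam1 * norm1(K)

problem_l1 = Problem(manifold=manifold, cost=cost_l1, verbosity=1)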
Example #7
    def setUp(self):
        self.n = n = 15
        self.man = PositiveDefinite(n)
Example #8
    def setUp(self):
        self.n = n = 10
        self.k = k = 3
        self.man = PositiveDefinite(n, k)
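For context (not part of the original test classes): a random point from PositiveDefinite is a symmetric matrix with strictly positive eigenvalues, and with k > 1 it is a stack of k such matrices. A quick sanity check along those lines, assuming the classic .rand() API used above:

import numpy as np
from pymanopt.manifolds import PositiveDefinite

X = PositiveDefinite(15).rand()
assert np.allclose(X, np.swapaxes(X, -1, -2))   # symmetric
assert np.all(np.linalg.eigvalsh(X) > 0)        # strictly positive eigenvalues

Xk = PositiveDefinite(10, 3).rand()
assert Xk.shape[-2:] == (10, 10)                # each of the k matrices is 10 x 10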
Example #9
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path', default=None, type=str, help='directory to save the model')
    parser.add_argument('--geomm_embeddings_path', default=None, type=str, help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e2, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    ## Logging
    #method_name = os.path.join('logs','geomm')
    #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    #if not os.path.exists(directory):
    #    os.makedirs(directory)
    #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train))
    #log_file_name = log_file_name + '.log'
    #class Logger(object):
    #    def __init__(self):
    #        self.terminal = sys.stdout
    #        self.log = open(os.path.join(directory,log_file_name), "a")

    #    def write(self, message):
    #        self.terminal.write(message)
    #        self.log.write(message)

    #    def flush(self):
    #        #this flush method is needed for python 3 compatibility.
    #        #this handles the flush command by doing nothing.
    #        #you might want to specify some extra behavior here.
    #        pass
    #sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile,max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    noov=0
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src,trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            noov+=1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg)) #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(noov))
    src_indices = src_indices
    trg_indices = trg_indices
    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)


    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))
    A = np.zeros((x_count,z_count))

    # Creating dictionary matrix from training set
    map_dict_src={}
    map_dict_trg={}
    I=0
    uniq_src=[]
    uniq_trg=[]
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]]=I
            I+=1
            uniq_src.append(src_indices[i])
    J=0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]]=J
            J+=1
            uniq_trg.append(trg_indices[j])

    for i in range(len(src_indices)):
        A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1

    np.random.seed(0)
    Lambda=args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B  = TT.matrix()

    Kx, Kz = x[uniq_src], z[uniq_trg]
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    # AA = np.sum(A*A) # this can be added if cost needs to be compared to original geomm

    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5*Lambda*(TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    # cost += shared(AA) # this can be added if cost needs to be compared with original geomm

    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(x.shape[1], x.shape[1]),
        Stiefel(z.shape[1], x.shape[1]),
        PositiveDefinite(x.shape[1])
    ])
    #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1,U2,B], verbosity=3)
    wopt = solver.solve(problem)

    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path,exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path),U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path),U2)
        np.savetxt('{}/B.csv'.format(args.model_path),B)

    # Step 2: Transformation
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'src.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(src_words,xw_n,outfile)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'trg.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(trg_words,zw_n,outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    X = xw[src_indices]
    Z = zw[trg_indices]

    # Loading test dictionary
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    ### compute nearest neighbours of x in z
    t=time.time()
    nbrhood_x=np.zeros(xw.shape[0])

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
        nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)

    ### compute nearest neighbours of z in x (GPU version)
    nbrhood_z=np.zeros(zw.shape[0])
    with cp.cuda.Device(0):
        nbrhood_z2=cp.zeros(zw.shape[0])
        batch_num=1
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
            nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
            batch_num+=1
        nbrhood_z=cp.asnumpy(nbrhood_z2)

    #### compute nearest neighbours of z in x (CPU version)
    #nbrhood_z=np.zeros(zw.shape[0])
    #for i in range(0, len(zw.shape[0]), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(zw.shape[0]))
    #    similarities = zw[i:j].dot(xw.T)
    #    similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
    #    nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1)

    #### find translation
    #for i in range(0, len(src), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(src))
    #    similarities = xw[src[i:j]].dot(zw.T)
    #    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
    #    nn = similarities.argmax(axis=1).tolist()
    #    similarities = np.argsort((similarities),axis=1)

    #    nn5 = (similarities[:,-5:])
    #    nn10 = (similarities[:,-10:])
    #    for k in range(j-i):
    #        translation[src[i+k]] = nn[k]
    #        translation5[src[i+k]] = nn5[k]
    #        translation10[src[i+k]] = nn10[k]


    #if args.geomm_embeddings_path is not None:
    #    delim=','
    #    os.makedirs(args.geomm_embeddings_path,exist_ok=True)

    #    translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
    #    with open(translations_fname,'w',encoding=args.encoding) as translations_file:
    #        for src_id in src:
    #            src_word = src_words[src_id]
    #            all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
    #            trgout_words = [ trg_words[j] for j in translation10[src_id] ]
    #            ss = list(nn10[src_id,:])
    #
    #            p1 = ':'.join(all_trg_words)
    #            p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
    #            translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) )

    ### find translation  (and write to file if output requested)
    delim=','
    translations_file =None
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)
        translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
        translations_file = open(translations_fname,'w',encoding=args.encoding)

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities),axis=1)

        nn5 = (similarities[:,-5:])
        nn10 = (similarities[:,-10:])
        for k in range(j-i):
            translation[src[i+k]] = nn[k]
            translation5[src[i+k]] = nn5[k]
            translation10[src[i+k]] = nn10[k]


            if args.geomm_embeddings_path is not None:
                src_id=src[i+k]
                src_word = src_words[src_id]
                all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
                trgout_words = [ trg_words[j] for j in translation10[src_id] ]
                #ss = list(nn10[src_id,:])

                p1 = ':'.join(all_trg_words)
                p2 = ':'.join(trgout_words)
                #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
                translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim) )

    if args.geomm_embeddings_path is not None:
        translations_file.close()

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean=0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy5 = mean

    mean=0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy10 = mean
    message = args.src_input.split(".")[-2] + "-->" + args.trg_input.split(".")[-2] + ": " + \
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy)
    print(message)
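For reference, the batched evaluation loops above implement CSLS retrieval: a pair's score is twice its cosine similarity minus the mean similarity of each word to its csls_neighbourhood nearest neighbours on the other side. A compact dense restatement of the same rule (illustrative only; assumes the rows of xw and zw are length-normalized):

import numpy as np

def csls_translate(xw, zw, k=10):
    sims = xw.dot(zw.T)                                # cosine similarities
    r_x = np.sort(sims, axis=1)[:, -k:].mean(axis=1)   # source-side neighbourhood term
    r_z = np.sort(sims, axis=0)[-k:, :].mean(axis=0)   # target-side neighbourhood term
    csls = 2 * sims - r_x[:, None] - r_z[None, :]
    return csls.argmax(axis=1)                         # best target index per source row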
Example #10
    if n_k > 0:
        samples[indices] = np.random.multivariate_normal(mu[k], Sigma[k], n_k)

# Plot the data
colors = ['r', 'g', 'b', 'c', 'm']
for i in range(K):
    indices = (i == components)
    plt.scatter(samples[indices, 0],
                samples[indices, 1],
                alpha=.4,
                color=colors[i % K])
plt.axis('equal')
plt.show()

# (1) Instantiate the manifold
manifold = Product([PositiveDefinite(D + 1, k=K), Euclidean(K - 1)])


# (2) Define cost function
# The parameters must be contained in a list theta.
def cost(theta):
    # Unpack parameters
    nu = np.concatenate([theta[1], [0]], axis=0)

    S = theta[0]
    logdetS = np.expand_dims(np.linalg.slogdet(S)[1], 1)
    y = np.concatenate([samples.T, np.ones((1, N))], axis=0)

    # Calculate log_q
    y = np.expand_dims(y, 0)
Example #11
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Generate latent space embeddings')
    parser.add_argument('emb1', help='path to embedding 1')
    parser.add_argument('emb2', help='path to embedding 2')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('--dictionary',
                               default=sys.stdin.fileno(),
                               help='the dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'no'],
        nargs=2,
        default=[],
        help=
        'the normalization actions performed in sequence for embeddings 1 and 2'
    )

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    args = parser.parse_args()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading embeddings data...')

    # Read input embeddings
    emb1file = open(args.emb1,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb2file = open(args.emb2,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype)
    emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype)

    # Build word to index map
    emb1_word2ind = {word: i for i, word in enumerate(emb1_words)}
    emb2_word2ind = {word: i for i, word in enumerate(emb2_words)}

    noov = 0
    emb1_indices = []
    emb2_indices = []
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        emb1, emb2 = line.split()
        try:
            emb1_ind = emb1_word2ind[emb1]
            emb2_ind = emb2_word2ind[emb2]
            emb1_indices.append(emb1_ind)
            emb2_indices.append(emb2_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    emb1, emb2))  #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of embedding pairs having at least one OOV: {}'.format(
            noov))
    emb1_indices = emb1_indices
    emb2_indices = emb2_indices
    if args.verbose:
        print('Normalizing embeddings...')

    # STEP 0: Normalization
    if len(args.normalize) > 0:
        x = normalize_emb(x, args.normalize[0])
        z = normalize_emb(z, args.normalize[1])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(emb1_indices))
    z_count = len(set(emb2_indices))

    # Filter out uniq values
    map_dict_emb1 = {}
    map_dict_emb2 = {}
    I = 0
    uniq_emb1 = []
    uniq_emb2 = []
    for i in range(len(emb1_indices)):
        if emb1_indices[i] not in map_dict_emb1.keys():
            map_dict_emb1[emb1_indices[i]] = I
            I += 1
            uniq_emb1.append(emb1_indices[i])
    J = 0
    for j in range(len(emb2_indices)):
        if emb2_indices[j] not in map_dict_emb2.keys():
            map_dict_emb2[emb2_indices[j]] = J
            J += 1
            uniq_emb2.append(emb2_indices[j])

    # Creating dictionary matrix
    row = list(range(0, x_count))
    col = list(range(0, x_count))
    data = [1 for i in range(0, x_count)]
    print(f"Counts: {x_count}, {z_count}")
    A = coo_matrix((data, (row, col)), shape=(x_count, z_count))

    np.random.seed(0)
    Lambda = args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()

    Xemb1 = x[uniq_emb1]
    Zemb2 = z[uniq_emb2]
    del x, z
    gc.collect()

    Kx, Kz = Xemb1, Zemb2
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    AA = np.sum(A * A)

    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5 * Lambda * (TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    cost += shared(AA)

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(Kx.shape[1], Kx.shape[1]),
        Stiefel(Kz.shape[1], Kz.shape[1]),
        PositiveDefinite(Kx.shape[1])
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)
    print(f"Problem solved ...")

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    print(f"Model copied ...")

    gc.collect()

    # Step 2: Transformation
    xw = Kx.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = Kz.dot(U2).dot(scipy.linalg.sqrtm(B))
    print(f"Transformation done ...")

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))

    del Kx, Kz, B, U1, U2
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)

    del xw, zw
    gc.collect()

    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb1.vec')
        new_emb1_words = []
        for id in uniq_emb1:
            new_emb1_words.append(emb1_words[id])
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb1_words, xw_n, outfile)

        new_emb2_words = []
        for id in uniq_emb2:
            new_emb2_words.append(emb2_words[id])
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb2.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb2_words, zw_n, outfile)

    exit(0)
Example #12
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('emb_file', help='file listing the input embedding files, one path per line')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
  
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain_file', '--dictionary_train_file', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest_file', '--dictionary_test_file', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    
    geomm_group = parser.add_argument_group('GeoMM Multi arguments', 'Arguments for GeoMM Multi method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e3, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')
   
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs','geomm_multi')
    directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file))
    log_file_name = log_file_name + '.log'
    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory,log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)  

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass    
    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    words = []
    emb = []
    with open(args.emb_file, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            srcfile = open(line.strip(), encoding=args.encoding, errors='surrogateescape')
            words_temp, x_temp = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
            words.append(words_temp)
            emb.append(x_temp)


    # Build word to index map
    word2ind = []
    for lang in words:
        word2ind.append({word: i for i, word in enumerate(lang)})

    # Build training dictionary
    train_pairs = []
    with open(args.dictionary_train_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            src_indices = []
            trg_indices = []
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                for line in f:
                    src,trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        src_ind = word2ind[curr_dict[0]][src]
                        trg_ind = word2ind[curr_dict[1]][trg]
                        src_indices.append(src_ind)
                        trg_indices.append(trg_ind)
                    except KeyError:
                        if args.verbose:
                            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
            curr_dict.append(src_indices)
            curr_dict.append(trg_indices)
            train_pairs.append(curr_dict)
    if args.verbose:
        print('Normalizing embeddings...')
    # Step 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize(emb[i])
        elif action == 'center':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center(emb[i])
        elif action == 'unitdim':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize_dimensionwise(emb[i])
        elif action == 'centeremb':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center_embeddingwise(emb[i])


    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    mean_size=0
    for tp in range(len(train_pairs)):
        src_indices = train_pairs[tp][2]
        trg_indices = train_pairs[tp][3]
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))
        A = np.zeros((x_count,z_count))
        
        # Creating dictionary matrix from training set
        map_dict_src={}
        map_dict_trg={}
        I=0
        uniq_src=[]
        uniq_trg=[]
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]]=I
                I+=1
                uniq_src.append(src_indices[i])
        J=0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]]=J
                J+=1
                uniq_trg.append(trg_indices[j])

        for i in range(len(src_indices)):
            A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1
        train_pairs[tp].append(uniq_src)
        train_pairs[tp].append(uniq_trg)
        train_pairs[tp].append(A)
        mean_size+= (len(uniq_src)*len(uniq_trg))
    mean_size = mean_size/len(train_pairs)
    np.random.seed(0)
    Lambda=args.l2_reg

    variables=[]
    manif = []
    low_rank=emb[0].shape[1]
    for i in range(len(emb)):
        variables.append(TT.matrix())
        manif.append(Stiefel(emb[i].shape[1],low_rank))
    variables.append(TT.matrix())
    manif.append(PositiveDefinite(low_rank))
    B = variables[-1]
    cost = 0.5*Lambda*(TT.sum(B**2))
    for i in range(len(train_pairs)):
        x = emb[train_pairs[i][0]]
        z = emb[train_pairs[i][1]]
        U1 = variables[train_pairs[i][0]]
        U2 = variables[train_pairs[i][1]]
        cost = cost + TT.sum(((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T)))).dot(shared(z[train_pairs[i][5]]).T)-shared(train_pairs[i][6]))**2)/float(len(train_pairs[i][2]))
    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter,mingradnorm=1e-12)
    manifold = Product(manif)
    problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3)
    wopt = solver.solve(problem)
    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    # Step 2: Transformation
    Bhalf = scipy.linalg.sqrtm(wopt[-1])
    test_emb = []
    for i in range(len(emb)):
        test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()


    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')

    if args.normalize_eval:
        for i in range(len(test_emb)):
            test_emb[i] = embeddings.length_normalize(test_emb[i])

    # Loading test dictionary
    with open(args.dictionary_test_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                src_word2ind = word2ind[curr_dict[0]]
                trg_word2ind = word2ind[curr_dict[1]]
                xw = test_emb[curr_dict[0]]
                zw = test_emb[curr_dict[1]]
                src2trg = collections.defaultdict(set)
                trg2src = collections.defaultdict(set)
                oov = set()
                vocab = set()
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        src_ind = src_word2ind[src]
                        trg_ind = trg_word2ind[trg]
                        src2trg[src_ind].add(trg_ind)
                        trg2src[trg_ind].add(src_ind)
                        vocab.add(src)
                    except KeyError:
                        oov.add(src)
                src = list(src2trg.keys())
                trgt = list(trg2src.keys())

                oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
                coverage = len(src2trg) / (len(src2trg) + len(oov))
                f.close()

                translation = collections.defaultdict(int)
                translation5 = collections.defaultdict(list)
                translation10 = collections.defaultdict(list)

                t=time.time()
                nbrhood_x=np.zeros(xw.shape[0])
                nbrhood_z=np.zeros(zw.shape[0])
                nbrhood_z2=cp.zeros(zw.shape[0])
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
                    nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)
                batch_num=1
                for i in range(0, zw.shape[0], BATCH_SIZE):
                    j = min(i + BATCH_SIZE, zw.shape[0])
                    similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
                    nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
                    batch_num+=1
                nbrhood_z=cp.asnumpy(nbrhood_z2)
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]])- nbrhood_z
                    nn = similarities.argmax(axis=1).tolist()
                    similarities = np.argsort((similarities),axis=1)

                    nn5 = (similarities[:,-5:])
                    nn10 = (similarities[:,-10:])
                    for k in range(j-i):
                        translation[src[i+k]] = nn[k]
                        translation5[src[i+k]] = nn5[k]
                        translation10[src[i+k]] = nn10[k]
                accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
                mean=0
                for i in src:
                    for k in translation5[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break

                mean/=len(src)
                accuracy5 = mean

                mean=0
                for i in src:
                    for k in translation10[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break

                mean/=len(src)
                accuracy10 = mean
                print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
Example #13
    def fit(self, RLRMCdata, verbosity=0, _evaluate=False):
        """The underlying fit method for RLRMC

        Args:
            RLRMCdata (RLRMCdataset): the RLRMCdataset object.
            verbosity (int): verbosity of Pymanopt. Possible values are 0 (least verbose), 1, or 2 (most verbose). 
            _evaluate (bool): flag to compute the per iteration statistics in train (and validation) datasets.
        """
        # initialize the model
        W0 = self._init_train(RLRMCdata.train)
        self.user2id = RLRMCdata.user2id
        self.item2id = RLRMCdata.item2id
        self.id2user = RLRMCdata.id2user
        self.id2item = RLRMCdata.id2item

        # residual variable
        residual_global = np.zeros(RLRMCdata.train.data.shape, dtype=np.float64)

        ###################Riemannian first-order algorithm######################

        solver = ConjugateGradientMS(
            maxtime=self.max_time,
            maxiter=self.maxiter,
            linesearch=LineSearchBackTracking(),
        )  # , logverbosity=2)
        # construction of manifold
        manifold = Product(
            [
                Stiefel(self.model_param.get("num_row"), self.rank),
                Stiefel(self.model_param.get("num_col"), self.rank),
                PositiveDefinite(self.rank),
            ]
        )
        problem = Problem(
            manifold=manifold,
            cost=lambda x: self._cost(
                x,
                RLRMCdata.train.data,
                RLRMCdata.train.indices,
                RLRMCdata.train.indptr,
                residual_global,
            ),
            egrad=lambda z: self._egrad(
                z, RLRMCdata.train.indices, RLRMCdata.train.indptr, residual_global
            ),
            verbosity=verbosity,
        )

        if _evaluate:
            residual_validation_global = np.zeros(
                RLRMCdata.validation.data.shape, dtype=np.float64
            )
            Wopt, self.stats = solver.solve(
                problem,
                x=W0,
                compute_stats=lambda x, y, z: self._my_stats(
                    x,
                    y,
                    z,
                    residual_global,
                    RLRMCdata.validation.data,
                    RLRMCdata.validation.indices,
                    RLRMCdata.validation.indptr,
                    residual_validation_global,
                ),
            )
        else:
            Wopt, self.stats = solver.solve(problem, x=W0)
        self.L = np.dot(Wopt[0], Wopt[2])
        self.R = Wopt[1]
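Not shown above: how the fitted factors are typically consumed. Since Wopt = (U1, U2, B) parametrizes the completed matrix as U1 B U2^T, the stored L = U1 B and R = U2 give a rating estimate for user u and item v as the inner product of L[u] and R[v]; a hypothetical helper:

import numpy as np

def predict_pairs(L, R, user_ids, item_ids):
    # Row-wise inner products <L[u], R[v]> for the requested (user, item) pairs.
    return np.sum(L[user_ids] * R[item_ids], axis=1)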
Example #14
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain',
        '--dictionary_train',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtrainspl',
        '--dictionary_trainspl',
        default=sys.stdin.fileno(),
        help='the training dictionary split file (defaults to stdin)')
    mapping_group.add_argument(
        '-dvalspl',
        '--dictionary_valspl',
        default=sys.stdin.fileno(),
        help='the validation dictionary split file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e-1,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')
    geomm_group.add_argument(
        '--x_cutoff',
        type=int,
        default=25000,
        help='Vocabulary cutoff for first language for bootstrapping')
    geomm_group.add_argument(
        '--z_cutoff',
        type=int,
        default=25000,
        help='Vocabulary cutoff for second language for bootstrapping')
    geomm_group.add_argument(
        '--patience',
        type=int,
        default=1,
        help=
        'Number of iterations with a decrease in validation accuracy permissible during bootstrapping'
    )

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=500,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs', 'geomm_semi')
    directory = os.path.join(
        os.path.join(os.getcwd(), method_name),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(
        os.path.basename(args.dictionary_train))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass

    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    trg_words, z = embeddings.read(trgfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()
    src_indices = src_indices
    trg_indices = trg_indices
    src_indices_train = list(src_indices)
    trg_indices_train = list(trg_indices)
    src_indices = []
    trg_indices = []

    # Loading train-split dictionary
    f = open(args.dictionary_trainspl,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)
    orig_src = src_indices
    orig_trg = trg_indices
    best_val_acc = 0
    best_add_src = []
    best_add_trg = []
    add_src = []
    add_trg = []

    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    it_count = 0
    drop_count = 0

    # Bootstrap loop
    while True:
        if args.verbose:
            print('Starting bootstrap iteration {0}'.format(it_count + 1))
        # Step 1.1: Optimization
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))

        # Creating dictionary matrix from training set
        map_dict_src = {}
        map_dict_trg = {}
        I = 0
        uniq_src = []
        uniq_trg = []
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]] = I
                I += 1
                uniq_src.append(src_indices[i])
        J = 0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]] = J
                J += 1
                uniq_trg.append(trg_indices[j])

        np.random.seed(0)
        Lambda = args.l2_reg
        U1 = TT.matrix()
        U2 = TT.matrix()
        B = TT.matrix()
        X_tot = x[uniq_src].T.dot(x[uniq_src])
        Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
        W = U1.dot(B.dot(U2.T))
        cost = (TT.nlinalg.trace(
            U2.dot(
                B.dot(
                    U1.T.dot(
                        shared(X_tot).dot(
                            U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))))) -
                2 * TT.sum(
                    (shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))
                ) / (len(src_indices)) + 0.5 * Lambda * (TT.sum(B**2))
        solver = ConjugateGradient(maxtime=args.max_opt_time,
                                   maxiter=args.max_opt_iter,
                                   mingradnorm=1e-15)

        low_rank = 300
        manifold = Product([
            Stiefel(x.shape[1], low_rank),
            Stiefel(z.shape[1], low_rank),
            PositiveDefinite(low_rank)
        ])
        problem = Problem(manifold=manifold,
                          cost=cost,
                          arg=[U1, U2, B],
                          verbosity=3)
        wopt = solver.solve(problem)
        w = wopt
        U1 = w[0]
        U2 = w[1]
        B = w[2]

        # Step 1.2: Transformation
        xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
        zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

        it_count += 1

        # Step 1.3: Compute Validation Accuracy
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)

        # Loading validation dictionary
        f = open(args.dictionary_valspl,
                 encoding=args.encoding,
                 errors='surrogateescape')
        src2trg = collections.defaultdict(set)
        trg2src = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            if args.max_vocab:
                src = src.lower()
                trg = trg.lower()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                trg2src[trg_ind].add(src_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        src = list(src2trg.keys())
        trgt = list(trg2src.keys())

        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        coverage = len(src2trg) / (len(src2trg) + len(oov))
        f.close()

        translation = collections.defaultdict(int)
        translation5 = collections.defaultdict(list)
        translation10 = collections.defaultdict(list)

        t = time.time()
        nbrhood_x = cp.zeros(xw.shape[0])
        nbrhood_z = cp.zeros(zw.shape[0])
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = -1 * cp.partition(
                -1 *
                cp.dot(cp.asarray(xw[src[i:j]]), cp.transpose(cp.asarray(zw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[src[i:j]] = (cp.mean(similarities, axis=1))

        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))

        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(xw[src[i:j]]).dot(
                    cp.transpose(cp.asarray(zw)))) -
                nbrhood_x[src[i:j]]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            similarities = cp.argsort((similarities), axis=1)

            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k].tolist()
                translation10[src[i + k]] = nn10[k].tolist()
        accuracy = np.mean(
            [1 if translation[i] in src2trg[i] else 0 for i in src])
        mean = 0
        for i in src:
            for k in translation5[i]:
                if k in src2trg[i]:
                    mean += 1
                    break

        mean /= len(src)
        accuracy5 = mean

        mean = 0
        for i in src:
            for k in translation10[i]:
                if k in src2trg[i]:
                    mean += 1
                    break

        mean /= len(src)
        accuracy10 = mean
        drop_count += 1
        if accuracy > best_val_acc:
            if args.verbose:
                print('Improvement of {0}%  over best validation accuracy!'.
                      format((accuracy - best_val_acc) * 100))
            best_val_acc = accuracy
            best_add_src = list(add_src)
            best_add_trg = list(add_trg)
            drop_count = 0

        if args.verbose:
            print(
                'Val Set:- Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
                .format(coverage, accuracy, accuracy5, accuracy10))
        if drop_count >= args.patience:
            if args.verbose:
                print('Training ended')
            break

        # Step 1.4: Dictionary Induction Stage (Bootstrap)
        # The first x_cutoff and z_cutoff words of the two languages are treated as their working
        # vocabularies (standard word embeddings list the most frequent words first). CSLS inference
        # is restricted to this subset and bootstrapping is bidirectional: entries are induced for the
        # first x_cutoff words of Language 1 and the first z_cutoff words of Language 2, and the
        # original training dictionary is added back, so the induced dictionary has
        # x_cutoff + z_cutoff + size(train_set) entries. A standalone CSLS sketch follows this example.
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)

        x_vocab_size = min(xw.shape[0], args.x_cutoff)
        z_vocab_size = min(zw.shape[0], args.z_cutoff)
        t = time.time()
        nbrhood_x = cp.zeros(x_vocab_size)
        best_sim_x = cp.zeros(x_vocab_size)
        best_sim_x_csls = cp.zeros(x_vocab_size)
        nbrhood_z = cp.zeros(z_vocab_size)

        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(xw[i:j]),
                            cp.transpose(cp.asarray(zw[:z_vocab_size]))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[i:j] = (cp.mean(similarities, axis=1))
            best_sim_x[i:j] = (cp.max(similarities, axis=1))
            batch_num += 1

        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]),
                            cp.transpose(cp.asarray(xw[:x_vocab_size]))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))
            batch_num += 1

        src_indices = list(range(0, x_vocab_size))
        trg_indices = []
        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(xw[i:j]).dot(
                    cp.transpose(cp.asarray(zw[:z_vocab_size])))) -
                nbrhood_x[i:j]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            trg_indices.append(nn)
            batch_num += 1

        src_indices2 = []
        trg_indices2 = list(range(0, z_vocab_size))
        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = cp.transpose(
                cp.transpose(2 * cp.asarray(zw[i:j]).dot(
                    cp.transpose(cp.asarray(xw[:x_vocab_size])))) -
                nbrhood_z[i:j]) - nbrhood_x
            nn = cp.argmax(similarities, axis=1).tolist()
            src_indices2.append(nn)
            batch_num += 1
        trg_indices = [item for sublist in trg_indices for item in sublist]
        src_indices2 = [item for sublist in src_indices2 for item in sublist]

        add_src = list(src_indices + src_indices2)
        add_trg = list(trg_indices + trg_indices2)
        src_indices = src_indices + src_indices2 + orig_src
        trg_indices = trg_indices + trg_indices2 + orig_trg

    end_time = time.time()
    if args.verbose:
        print('Completed bootstrapping in {0:.2f} seconds'.format(end_time -
                                                                  start_time))

    # Step 2: Final Training with bootstrapped dictionary
    if args.verbose:
        print('Training final model')
    src_indices = best_add_src + src_indices_train
    trg_indices = best_add_trg + trg_indices_train
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]] = I
            I += 1
            uniq_src.append(src_indices[i])
    J = 0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]] = J
            J += 1
            uniq_trg.append(trg_indices[j])

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    X_tot = x[uniq_src].T.dot(x[uniq_src])
    Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
    W = U1.dot(B.dot(U2.T))
    cost = (TT.nlinalg.trace(
        U2.dot(
            B.dot(
                U1.T.dot(
                    shared(X_tot).dot(U1.dot(B.dot(U2.T.dot(shared(Z_tot)))))))
        )) - 2 * TT.sum(
            (shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))
            ) / len(src_indices) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(x.shape[1], low_rank),
        Stiefel(z.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    gc.collect()

    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')

    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)
    # Loading test dictionary
    f = open(args.dictionary_test,
             encoding=args.encoding,
             errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities, axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
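# Below is a minimal, standalone sketch of the CSLS retrieval that the batched loops above
# implement; csls_translate is a hypothetical helper, not part of the original script. For each
# source vector, the dot (or cosine, after length normalization) similarity to every target vector
# is penalised by the mean similarity of each side to its k nearest neighbours on the other side,
# and the argmax of the penalised score gives the induced translation.
import numpy as np


def csls_translate(xw, zw, k=10):
    """Return, for every row of xw, the index of its CSLS nearest neighbour among the rows of zw."""
    sims = xw.dot(zw.T)  # raw similarities, shape (n_src, n_trg)
    # mean similarity of each source word to its k nearest target neighbours
    r_x = np.mean(-np.partition(-sims, k - 1, axis=1)[:, :k], axis=1)
    # mean similarity of each target word to its k nearest source neighbours
    r_z = np.mean(-np.partition(-sims.T, k - 1, axis=1)[:, :k], axis=1)
    # CSLS(x, z) = 2 * sim(x, z) - r(x) - r(z)
    csls = 2 * sims - r_x[:, None] - r_z[None, :]
    return np.argmax(csls, axis=1)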
Example #15
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('mid_input', help='the input pivot embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded; 0 loads the complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain1',
        '--dictionary_train1',
        default=sys.stdin.fileno(),
        help='the first training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtrain2',
        '--dictionary_train2',
        default=sys.stdin.fileno(),
        help='the second training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=1000,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging
    method_name = os.path.join('logs', 'geomm_cmp_pip')
    directory = os.path.join(
        os.path.join(os.getcwd(), method_name),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(
        os.path.basename(args.dictionary_test))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            # This flush method is needed for Python 3 compatibility; it handles the
            # flush call by doing nothing. Extra behaviour can be specified here if needed.
            pass

    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    midfile = open(args.mid_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')

    src_words, x = embeddings.read(srcfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    mid_words, y = embeddings.read(midfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)
    trg_words, z = embeddings.read(trgfile,
                                   max_voc=args.max_vocab,
                                   dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    mid_word2ind = {word: i for i, word in enumerate(mid_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary-1
    src_indices12 = []
    trg_indices12 = []
    f = open(args.dictionary_train1,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = mid_word2ind[trg]
            src_indices12.append(src_ind)
            trg_indices12.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    # Build training dictionary-2
    src_indices23 = []
    trg_indices23 = []
    f = open(args.dictionary_train2,
             encoding=args.encoding,
             errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = mid_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices23.append(src_ind)
            trg_indices23.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            y = embeddings.length_normalize(y)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            y = embeddings.mean_center(y)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            y = embeddings.length_normalize_dimensionwise(y)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            y = embeddings.mean_center_embeddingwise(y)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1.1: Optimization-1
    if args.verbose:
        print('Beginning Optimization-1')
    start_time = time.time()

    x_count = len(set(src_indices12))
    y_count = len(set(trg_indices12))
    A = np.zeros((x_count, y_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices12)):
        if src_indices12[i] not in map_dict_src.keys():
            map_dict_src[src_indices12[i]] = I
            I += 1
            uniq_src.append(src_indices12[i])
    J = 0
    for j in range(len(trg_indices12)):
        if trg_indices12[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices12[j]] = J
            J += 1
            uniq_trg.append(trg_indices12[j])

    for i in range(len(src_indices12)):
        A[map_dict_src[src_indices12[i]], map_dict_trg[trg_indices12[i]]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
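    # GeoMM objective for the first language pair: squared error between the bilinear similarities
    # x[uniq_src] . (U1 B U2.T) . y[uniq_trg].T and the 0/1 dictionary matrix A,
    # plus an L2 penalty on B.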
    cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(y[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(x.shape[1], low_rank),
        Stiefel(y.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    w12 = U1.dot(B).dot(U2.T)
    u11 = U1
    u21 = U2
    b1 = B

    # Step 1.2: Optimization-2
    if args.verbose:
        print('Beginning Optimization-2')
    y_count = len(set(src_indices23))
    z_count = len(set(trg_indices23))
    A = np.zeros((y_count, z_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices23)):
        if src_indices23[i] not in map_dict_src.keys():
            map_dict_src[src_indices23[i]] = I
            I += 1
            uniq_src.append(src_indices23[i])
    J = 0
    for j in range(len(trg_indices23)):
        if trg_indices23[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices23[j]] = J
            J += 1
            uniq_trg.append(trg_indices23[j])

    for i in range(len(src_indices23)):
        A[map_dict_src[src_indices23[i]], map_dict_trg[trg_indices23[i]]] = 1

    np.random.seed(0)
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    cost = TT.sum(((shared(y[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    low_rank = 300
    manifold = Product([
        Stiefel(y.shape[1], low_rank),
        Stiefel(z.shape[1], low_rank),
        PositiveDefinite(low_rank)
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    w23 = U1.dot(B).dot(U2.T)
    u22 = U1
    u32 = U2
    b2 = B

    # Step 2: Transformation
    w12_1 = u11.dot(scipy.linalg.sqrtm(b1))
    w12_2 = u21.dot(scipy.linalg.sqrtm(b1))
    w23_1 = u22.dot(scipy.linalg.sqrtm(b2))
    w23_2 = u32.dot(scipy.linalg.sqrtm(b2))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))
    gc.collect()

    # Step 3: Evaluation
    # Loading test dictionary
    f = open(args.dictionary_test,
             encoding=args.encoding,
             errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    # Composition (CMP)
    xw = x.dot(w12).dot(w23)
    zw = z
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'CMP: Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))

    # Pipeline (PIP)
    xw = x.dot(w12_1)
    zw = y.dot(w12_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation12 = collections.defaultdict(int)
    # PIP-Stage 1
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        for k in range(j - i):
            translation[src[i + k]] = nn[k]

    # PIP-Stage 2
    mid = [translation[sr] for sr in src]
    xw = y.dot(w23_1)
    zw = z.dot(w23_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        # similarities_x = np.sort(similarities, axis=1)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[mid[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(
            -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
            args.csls_neighbourhood - 1,
            axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood],
                                   axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        similarities = np.transpose(
            np.transpose(2 * similarities) - nbrhood_x[mid[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)

        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy5 = mean

    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break

    mean /= len(src)
    accuracy10 = mean
    print(
        'PIP: Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
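# A small numeric sanity check (a sketch, not part of the script above): with a symmetric
# positive-definite B, mapping the two sides by U1 @ sqrtm(B) and U2 @ sqrtm(B) reproduces the
# bilinear similarity x W z^T induced by the full map W = U1 B U2^T, which is why the code above
# can compare the transformed embeddings directly in the shared low-rank space.
import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
d, rank = 50, 10
U1, _ = np.linalg.qr(rng.standard_normal((d, rank)))  # orthonormal columns (a Stiefel point)
U2, _ = np.linalg.qr(rng.standard_normal((d, rank)))
M = rng.standard_normal((rank, rank))
B = M @ M.T + rank * np.eye(rank)  # symmetric positive definite

x = rng.standard_normal((5, d))
z = rng.standard_normal((7, d))
W = U1 @ B @ U2.T

sqrtB = scipy.linalg.sqrtm(B).real
lhs = (x @ U1 @ sqrtB) @ (z @ U2 @ sqrtB).T  # similarities in the shared space
rhs = x @ W @ z.T                            # similarities under the full map
assert np.allclose(lhs, rhs)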
Example #16
0
import math

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import networkx as nx
import numpy as np
import numpy.linalg as nl
from pymanopt.manifolds import PositiveDefinite
import scipy.linalg as sl

import sage.all
from sage.graphs.graph import Graph
import sage.graphs.hyperbolicity as sage_hyp

matrix_dim = 3
n_samples = 500
variance = None
manifold = PositiveDefinite(matrix_dim)


def dim_to_n(dim):
    return int((-1 + math.sqrt(1 + 8 * dim)) / 2)


def sym_to_vec(x):
    r"""The :math:`Vec(\cdot)` mapping from [1, Sec3.5]."""
    n = x.shape[0]
    y = np.copy(x)
    y[np.triu_indices(n, 1)] *= math.sqrt(2)

    return y[np.triu_indices(n)]
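

def vec_to_sym(v):
    r"""Inverse of sym_to_vec: a hypothetical helper sketched here for illustration, not part of
    the original module. It undoes the sqrt(2) scaling of the off-diagonal entries and rebuilds
    the full symmetric matrix from its upper triangle."""
    n = dim_to_n(v.shape[0])
    y = np.zeros((n, n))
    y[np.triu_indices(n)] = v
    y[np.triu_indices(n, 1)] /= math.sqrt(2)
    return y + np.triu(y, 1).T


# Quick checks (assuming the definitions above): the sqrt(2) scaling makes the Euclidean norm of
# the vectorisation match the Frobenius norm of the symmetric matrix, and dim_to_n inverts the
# n(n+1)/2 vector length.
S = manifold.rand()  # a random SPD matrix from the manifold defined above
v = sym_to_vec(S)
assert v.shape == (matrix_dim * (matrix_dim + 1) // 2,)
assert dim_to_n(v.shape[0]) == matrix_dim
assert np.isclose(np.linalg.norm(v), np.linalg.norm(S, 'fro'))
assert np.allclose(vec_to_sym(v), S)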