Пример #1
0
def extract(encoder,
            token_lang,
            bpe_codes,
            ifname,
            output,
            remove=False,
            verbose=False):
    with tempfile.TemporaryDirectory() as tmpdir:
        #        ifname = ''
        if token_lang != '--':
            tok_fname = os.path.join(tmpdir, 'tok')
            Token(ifname,
                  tok_fname,
                  lang=token_lang,
                  romanize=True if token_lang == 'el' else False,
                  lower_case=True,
                  gzip=False,
                  verbose=verbose,
                  over_write=False)
            ifname = tok_fname
        if bpe_codes:
            bpe_fname = os.path.join(tmpdir, 'bpe')
            BPEfastApply(ifname,
                         bpe_fname,
                         bpe_codes,
                         verbose=verbose,
                         over_write=True)
            ifname = bpe_fname
        EncodeFile(encoder,
                   ifname,
                   output,
                   verbose=verbose,
                   over_write=False,
                   buffer_size=10000)
        return EmbedLoad(output)
Пример #2
0
 def _loadTextAndEmb(textF, encoding, embF, encoderDim, unify, verbose):
     inds, sents = TextLoadUnify(textF, encoding, unify, verbose)
     emb = EmbedLoad(embF, encoderDim, verbose=verbose)
     if unify:
         emb = unique_embeddings(emb, inds)
     faiss.normalize_L2(emb)
     return inds, sents, emb
Пример #3
0
def load_embeds(input_file, dimension):
    x = EmbedLoad(input_file, dimension, verbose=False)
    faiss.normalize_L2(x)
    return x
Пример #4
0
def Mine(src_doc_ind, trg_doc_ind, src, trg, encoding, src_embeddings,
         trg_embeddings, output, unify, mode, retrieval, margin, neighborhood,
         gpu, dim, threshold, verbose):
    print('LASER: tool to search, score or mine bitexts', file=sys.stderr)
    if gpu:
        print(' - knn will run on all available GPUs (recommended)',
              file=sys.stderr)
    else:
        print(' - knn will run on CPU (slow)', file=sys.stderr)

    args = AttrDict({"encoding": encoding, "unify": unify, "verbose": verbose})
    src_inds, src_sents = TextLoadUnify(src, args)
    trg_inds, trg_sents = TextLoadUnify(trg, args)

    def unique_embeddings(emb, ind, verbose=False):
        aux = {j: i for i, j in enumerate(ind)}
        if verbose:
            print(' - unify embeddings: {:d} -> {:d}'.format(
                len(emb), len(aux)),
                  file=sys.stderr)
        return emb[[aux[i] for i in range(len(aux))]]

    # load the embeddings
    x = EmbedLoad(src_embeddings, dim, verbose=verbose)
    if unify:
        x = unique_embeddings(x, src_inds, verbose)
    faiss.normalize_L2(x)
    y = EmbedLoad(trg_embeddings, dim, verbose=verbose)
    if unify:
        y = unique_embeddings(y, trg_inds, verbose)
    faiss.normalize_L2(y)

    # calculate knn in both directions
    if retrieval != 'bwd':
        if verbose:
            print(' - perform {:d}-nn source against target'.format(
                neighborhood),
                  file=sys.stderr)
        x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], neighborhood), gpu)
        x2y_mean = x2y_sim.mean(axis=1)

    if retrieval != 'fwd':
        if verbose:
            print(' - perform {:d}-nn target against source'.format(
                neighborhood),
                  file=sys.stderr)
        y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], neighborhood), gpu)
        y2x_mean = y2x_sim.mean(axis=1)

    # margin function
    if margin == 'absolute':

        def margin(a, b):
            return a
    elif margin == 'distance':

        def margin(a, b):
            return a - b
    else:  # margin == 'ratio':

        def margin(a, b):
            return a / b

    if output:
        if output.endswith('.xz'):
            fout = lzma.open(output,
                             mode='at',
                             encoding=encoding,
                             errors='surrogateescape')
        else:
            fout = open(output,
                        mode='a',
                        encoding=encoding,
                        errors='surrogateescape')
    else:
        output = "stdout"
        fout = sys.stdout

    if mode == 'search':
        if verbose:
            print(' - Searching for closest sentences in target',
                  file=sys.stderr)
            print(' - writing alignments to {:s}'.format(output),
                  file=sys.stderr)
        scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin,
                                  verbose)
        best = x2y_ind[np.arange(x.shape[0]), scores.argmax(axis=1)]

        nbex = x.shape[0]
        ref = np.linspace(0, nbex - 1, nbex).astype(int)  # [0, nbex)
        err = nbex - np.equal(best.reshape(nbex), ref).astype(int).sum()
        print(' - errors: {:d}={:.2f}%'.format(err, 100 * err / nbex),
              file=sys.stderr)
        for i in src_inds:
            print(trg_sents[best[i]], file=fout)

    elif mode == 'score':
        for i, j in zip(src_inds, trg_inds):
            s = score(x[i], y[j], x2y_mean[i], y2x_mean[j], margin)
            print(s, src_sents[i], trg_sents[j], sep='\t', file=fout)

    elif mode == 'mine':
        if verbose:
            print(' - mining for parallel data', file=sys.stderr)
        fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean,
                                      margin, verbose)
        bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean,
                                      margin, verbose)
        fwd_best = x2y_ind[np.arange(x.shape[0]), fwd_scores.argmax(axis=1)]
        bwd_best = y2x_ind[np.arange(y.shape[0]), bwd_scores.argmax(axis=1)]
        if verbose:
            print(' - writing alignments to {:s}'.format(output),
                  file=sys.stderr)
            if threshold > 0:
                print(' - with threshold of {:f}'.format(threshold),
                      file=sys.stderr)
        if retrieval == 'fwd':
            for i, j in enumerate(fwd_best):
                print(fwd_scores[i].max(),
                      src_sents[i],
                      trg_sents[j],
                      sep='\t',
                      file=fout)
        if retrieval == 'bwd':
            for j, i in enumerate(bwd_best):
                print(bwd_scores[j].max(),
                      src_sents[i],
                      trg_sents[j],
                      sep='\t',
                      file=fout)
        if retrieval == 'intersect':
            for i, j in enumerate(fwd_best):
                if bwd_best[j] == i:
                    print(fwd_scores[i].max(),
                          src_sents[i],
                          trg_sents[j],
                          sep='\t',
                          file=fout)
        if retrieval == 'max':
            indices = np.stack(
                (np.concatenate((np.arange(x.shape[0]), bwd_best)),
                 np.concatenate((fwd_best, np.arange(y.shape[0])))),
                axis=1)
            scores = np.concatenate(
                (fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
            seen_src, seen_trg = set(), set()
            for i in np.argsort(-scores):
                src_ind, trg_ind = indices[i]
                if src_ind not in seen_src and trg_ind not in seen_trg:
                    seen_src.add(src_ind)
                    seen_trg.add(trg_ind)
                    if scores[i] > threshold:
                        print(src_doc_ind,
                              trg_doc_ind,
                              src_sents[src_ind],
                              trg_sents[trg_ind],
                              scores[i],
                              sep='\t',
                              file=fout)

    if fout != sys.stdout:
        fout.close()
Пример #5
0
        print(' - knn will run on all available GPUs (recommended)')
    else:
        print(' - knn will run on CPU (slow)')

    src_inds, src_sents = TextLoadUnify(args.src, args)
    trg_inds, trg_sents = TextLoadUnify(args.trg, args)

    def unique_embeddings(emb, ind, verbose=False):
        aux = {j: i for i, j in enumerate(ind)}
        if verbose:
            print(' - unify embeddings: {:d} -> {:d}'.format(
                len(emb), len(aux)))
        return emb[[aux[i] for i in range(len(aux))]]

    # load the embeddings
    x = EmbedLoad(args.src_embeddings, args.dim, verbose=args.verbose)
    if args.unify:
        x = unique_embeddings(x, src_inds, args.verbose)
    faiss.normalize_L2(x)
    y = EmbedLoad(args.trg_embeddings, args.dim, verbose=args.verbose)
    if args.unify:
        y = unique_embeddings(y, trg_inds, args.verbose)
    faiss.normalize_L2(y)

    ###########################
    # modification 20200531
    ###########################
    if args.cmp_embeddings is None:  # matching tms
        z = None
    else:  # matching tmt
        z = EmbedLoad(args.cmp_embeddings, args.dim, verbose=args.verbose)
Пример #6
0
    src_inds, src_sents = TextLoadUnify(args.src, args)
    trg_inds, trg_sents = TextLoadUnify(args.trg, args)

    if args.trans:
        srcTransFile = ".enTranslated".join(args.src.rsplit('.bn', 1))
        srcTrans_inds, srcTrans_sents = TextLoadUnify(srcTransFile, args)

    def unique_embeddings(emb, ind, verbose=False):
        aux = {j: i for i, j in enumerate(ind)}
        if verbose:
            print(' - unify embeddings: {:d} -> {:d}'.format(
                len(emb), len(aux)))
        return emb[[aux[i] for i in range(len(aux))]]

    # load the embeddings
    x = EmbedLoad(args.src_embeddings, args.dim, verbose=args.verbose)

    if args.trans:
        xTrans = EmbedLoad(".enTranslated".join(
            args.src_embeddings.rsplit('.bn', 1)),
                           args.dim,
                           verbose=args.verbose)

    if args.unify:
        x = unique_embeddings(x, src_inds, args.verbose)
        if args.trans:
            xTrans = unique_embeddings(xTrans, src_inds, args.verbose)

    if args.trans:
        faiss.normalize_L2(xTrans)