def Mine(src_doc_ind, trg_doc_ind, src, trg, encoding, src_embeddings,
         trg_embeddings, output, unify, mode, retrieval, margin,
         neighborhood, gpu, dim, threshold, verbose):
    """Search, score or mine a bitext from precomputed sentence embeddings.

    Loads both texts and their embeddings, L2-normalises the embeddings,
    runs k-nearest-neighbour search between the two sides and writes the
    result for the selected ``mode`` to ``output``.

    Args:
        src_doc_ind: Written as the first column of each pair emitted by
            ``mode='mine'`` with ``retrieval='max'``; unused otherwise.
        trg_doc_ind: Written as the second column in that same case.
        src: Source text, handed to ``TextLoadUnify``.
        trg: Target text, handed to ``TextLoadUnify``.
        encoding: Passed to ``TextLoadUnify`` and used to open ``output``.
        src_embeddings: Source embeddings, handed to ``EmbedLoad``.
        trg_embeddings: Target embeddings, handed to ``EmbedLoad``.
        output: Output path, opened in append mode (through ``lzma`` when
            it ends in ``.xz``). A falsy value writes to ``sys.stdout``.
        unify: Passed to ``TextLoadUnify``; when truthy the embeddings are
            also reduced through the indices it returns.
        mode: ``'search'``, ``'score'`` or ``'mine'``. Any other value
            writes nothing.
        retrieval: Pair selection for ``mode='mine'``: ``'fwd'``,
            ``'bwd'``, ``'intersect'`` or ``'max'``.
        margin: ``'absolute'`` or ``'distance'``; every other value is
            treated as ``'ratio'``.
        neighborhood: Number of neighbours requested from ``knn``, capped
            at the number of embeddings on the side being searched.
        gpu: Forwarded to ``knn``; also selects the start-up message.
        dim: Forwarded to ``EmbedLoad``.
        threshold: Minimum score (exclusive) for a pair to be written;
            only applied when ``retrieval='max'``.
        verbose: Emit progress messages on ``sys.stderr``.
    """
    print('LASER: tool to search, score or mine bitexts', file=sys.stderr)
    if gpu:
        print(' - knn will run on all available GPUs (recommended)',
              file=sys.stderr)
    else:
        print(' - knn will run on CPU (slow)', file=sys.stderr)

    args = AttrDict({"encoding": encoding, "unify": unify, "verbose": verbose})
    src_inds, src_sents = TextLoadUnify(src, args)
    trg_inds, trg_sents = TextLoadUnify(trg, args)

    def unique_embeddings(emb, ind, verbose=False):
        # Maps every distinct value in `ind` to the position of its last
        # occurrence, then picks those rows in value order.
        # NOTE(review): assumes the values in `ind` are exactly
        # 0..K-1 -- confirm against TextLoadUnify.
        aux = {j: i for i, j in enumerate(ind)}
        if verbose:
            print(' - unify embeddings: {:d} -> {:d}'.format(
                len(emb), len(aux)), file=sys.stderr)
        return emb[[aux[i] for i in range(len(aux))]]

    # load the embeddings
    x = EmbedLoad(src_embeddings, dim, verbose=verbose)
    if unify:
        x = unique_embeddings(x, src_inds, verbose)
    faiss.normalize_L2(x)
    y = EmbedLoad(trg_embeddings, dim, verbose=verbose)
    if unify:
        y = unique_embeddings(y, trg_inds, verbose)
    faiss.normalize_L2(y)

    # calculate knn in both directions
    # Every mode below reads the neighbourhood means of BOTH directions, so
    # skipping one search for retrieval 'fwd'/'bwd' would leave the names
    # it produces unbound and crash as soon as they are used.
    needs_both = mode in ('search', 'score', 'mine')
    if needs_both or retrieval != 'bwd':
        if verbose:
            print(' - perform {:d}-nn source against target'.format(
                neighborhood), file=sys.stderr)
        x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], neighborhood), gpu)
        x2y_mean = x2y_sim.mean(axis=1)

    if needs_both or retrieval != 'fwd':
        if verbose:
            print(' - perform {:d}-nn target against source'.format(
                neighborhood), file=sys.stderr)
        y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], neighborhood), gpu)
        y2x_mean = y2x_sim.mean(axis=1)

    # margin function (kept under its own name so the `margin` argument is
    # not silently replaced by a callable)
    if margin == 'absolute':
        def margin_fn(a, b):
            return a
    elif margin == 'distance':
        def margin_fn(a, b):
            return a - b
    else:  # margin == 'ratio':
        def margin_fn(a, b):
            return a / b

    if output:
        if output.endswith('.xz'):
            fout = lzma.open(output, mode='at', encoding=encoding,
                             errors='surrogateescape')
        else:
            fout = open(output, mode='a', encoding=encoding,
                        errors='surrogateescape')
    else:
        output = "stdout"
        fout = sys.stdout

    # try/finally so the output file is closed even when scoring or
    # writing raises.
    try:
        if mode == 'search':
            if verbose:
                print(' - Searching for closest sentences in target',
                      file=sys.stderr)
                print(' - writing alignments to {:s}'.format(output),
                      file=sys.stderr)
            scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean,
                                      margin_fn, verbose)
            best = x2y_ind[np.arange(x.shape[0]), scores.argmax(axis=1)]

            # An "error" is a source row whose best target is not the row
            # with the same index.
            nbex = x.shape[0]
            ref = np.linspace(0, nbex - 1, nbex).astype(int)  # [0, nbex)
            err = nbex - np.equal(best.reshape(nbex), ref).astype(int).sum()
            print(' - errors: {:d}={:.2f}%'.format(err, 100 * err / nbex),
                  file=sys.stderr)
            for i in src_inds:
                print(trg_sents[best[i]], file=fout)

        elif mode == 'score':
            for i, j in zip(src_inds, trg_inds):
                s = score(x[i], y[j], x2y_mean[i], y2x_mean[j], margin_fn)
                print(s, src_sents[i], trg_sents[j], sep='\t', file=fout)

        elif mode == 'mine':
            if verbose:
                print(' - mining for parallel data', file=sys.stderr)
            fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean,
                                          margin_fn, verbose)
            bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean,
                                          margin_fn, verbose)
            fwd_best = x2y_ind[np.arange(x.shape[0]),
                               fwd_scores.argmax(axis=1)]
            bwd_best = y2x_ind[np.arange(y.shape[0]),
                               bwd_scores.argmax(axis=1)]
            if verbose:
                print(' - writing alignments to {:s}'.format(output),
                      file=sys.stderr)
                if threshold > 0:
                    print(' - with threshold of {:f}'.format(threshold),
                          file=sys.stderr)
            if retrieval == 'fwd':
                for i, j in enumerate(fwd_best):
                    print(fwd_scores[i].max(), src_sents[i], trg_sents[j],
                          sep='\t', file=fout)
            if retrieval == 'bwd':
                for j, i in enumerate(bwd_best):
                    print(bwd_scores[j].max(), src_sents[i], trg_sents[j],
                          sep='\t', file=fout)
            if retrieval == 'intersect':
                # keep a pair only when both directions agree on it
                for i, j in enumerate(fwd_best):
                    if bwd_best[j] == i:
                        print(fwd_scores[i].max(), src_sents[i],
                              trg_sents[j], sep='\t', file=fout)
            if retrieval == 'max':
                # Pool the best candidate of every source row and of every
                # target row, then accept them greedily by decreasing
                # score, using each sentence at most once.
                indices = np.stack(
                    (np.concatenate((np.arange(x.shape[0]), bwd_best)),
                     np.concatenate((fwd_best, np.arange(y.shape[0])))),
                    axis=1)
                scores = np.concatenate(
                    (fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
                seen_src, seen_trg = set(), set()
                for i in np.argsort(-scores):
                    src_ind, trg_ind = indices[i]
                    if src_ind not in seen_src and trg_ind not in seen_trg:
                        seen_src.add(src_ind)
                        seen_trg.add(trg_ind)
                        if scores[i] > threshold:
                            print(src_doc_ind, trg_doc_ind,
                                  src_sents[src_ind], trg_sents[trg_ind],
                                  scores[i], sep='\t', file=fout)
    finally:
        if fout is not sys.stdout:
            fout.close()
def exec_function(self, args):
    """Embed a source/target file pair and score or mine it as a bitext.

    Steps, in order: fetch the input files (and optional BPE codes and
    encoder) through ``self._storage``; run ``TokBpeEmb`` on both sides,
    in two processes when ``args.gpuid`` is a list and one after the other
    otherwise; load texts and embeddings; run kNN in both directions; then,
    depending on ``args.tumode``, call ``scoreBitext`` or ``mineBitext``,
    push the result files and log the score statistics.

    ``args.srclang`` / ``args.tgtlang`` are filled in from the file names
    when they are ``None``.
    """
    setCUDA_VISIBLE_DEVICES(args.gpuid)

    def _local_copy_path(remote):
        # Path inside the data directory, named after the last element
        # that self._storage.split() returns for `remote`.
        return os.path.join(self._data_dir, self._storage.split(remote)[-1])

    # Defaults shipped with LASER; replaced below when given explicitly.
    bpe_codes_path = LASER + '/models/93langs.fcodes'
    encoder_path = LASER + '/models/bilstm.93langs.2018-12-26.pt'

    #################
    # Parse arguments and retrieve files
    #################
    src_path = _local_copy_path(args.srcfile)
    tgt_path = _local_copy_path(args.tgtfile)
    self._storage.get_file(args.srcfile, src_path)
    self._storage.get_file(args.tgtfile, tgt_path)
    out_path = _local_copy_path(args.output)

    if args.bpecodes is not None:
        bpe_codes_path = _local_copy_path(args.bpecodes)
        self._storage.get_file(args.bpecodes, bpe_codes_path)

    if args.encoder is not None:
        encoder_path = _local_copy_path(args.encoder)
        self._storage.get_file(args.encoder, encoder_path)

    if args.srclang is None:
        args.srclang = inferLangFromFilename(args.srcfile)
    if args.tgtlang is None:
        args.tgtlang = inferLangFromFilename(args.tgtfile)

    logger.info("srclang: %s, srcfile: %s (%s)"
                % (args.srclang, args.srcfile, src_path))
    logger.info("tgtlang: %s, tgtfile: %s (%s)"
                % (args.tgtlang, args.tgtfile, tgt_path))
    logger.info("output: %s (%s)" % (args.output, out_path))
    logger.info("encoderF: %s (%s)" % (args.encoder, encoder_path))
    logger.info("bpeCodesF: %s (%s)" % (args.bpecodes, bpe_codes_path))

    #################
    # Perform tasks
    #################
    with tempfile.TemporaryDirectory() as workdir:
        (src_tok, src_bpe, src_emb,
         tgt_tok, tgt_bpe, tgt_emb) = (
            os.path.join(workdir, stem)
            for stem in ('srctok', 'srcbpe', 'srcemb',
                         'tgttok', 'tgtbpe', 'tgtemb'))

        logger.debug(' - gpuid: %s' % args.gpuid)

        # Leading TokBpeEmb arguments for each side, source first.
        sides = (
            (args.srclang, src_path, src_tok, src_bpe, src_emb),
            (args.tgtlang, tgt_path, tgt_tok, tgt_bpe, tgt_emb),
        )

        if isinstance(args.gpuid, list):
            logger.debug(' - perform src and tgt embedding in parallel')
            import torch.multiprocessing as mp
            workers = []
            for slot, side in enumerate(sides):
                # gpuid[slot] is read only now, so the source worker is
                # already running before the target's entry is looked up.
                worker = mp.Process(
                    target=TokBpeEmb,
                    args=side + (bpe_codes_path, encoder_path,
                                 args.encoderbuffersize,
                                 args.encodermaxtokens,
                                 args.verbose, args.gpuid[slot]))
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
        else:
            logger.info(' - perform src and tgt embedding in series')
            encoder = loadEncoder(encoder_path, args.encoderbuffersize,
                                  args.encodermaxtokens,
                                  cpu=(args.gpuid == 0))
            for side in sides:
                TokBpeEmb(*side, bpe_codes_path, encoder,
                          args.encoderbuffersize, args.encodermaxtokens,
                          args.verbose, args.gpuid)

        # LASER options
        setCUDA_VISIBLE_DEVICES(args.gpuid)
        unify = True
        retrieval = 'max'
        neighborhood = 5
        gpu = (args.gpuid != 0)

        # load bitext and embeddings
        def _load_side(text_path, emb_path):
            inds, sents = TextLoadUnify(text_path, args.encoding, unify,
                                        args.verbose)
            emb = EmbedLoad(emb_path, args.encoderdim, verbose=args.verbose)
            if unify:
                emb = unique_embeddings(emb, inds)
            faiss.normalize_L2(emb)
            return inds, sents, emb

        src_inds, src_sents, x = _load_side(src_path, src_emb)
        trg_inds, trg_sents, y = _load_side(tgt_path, tgt_emb)

        # calculate knn in both directions
        if retrieval != 'bwd':
            logger.info(' - perform {:d}-nn source against target'.format(neighborhood))
            x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], neighborhood), gpu)
            x2y_mean = x2y_sim.mean(axis=1)
        if retrieval != 'fwd':
            logger.info(' - perform {:d}-nn target against source'.format(neighborhood))
            y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], neighborhood), gpu)
            y2x_mean = y2x_sim.mean(axis=1)

        # margin function
        def _absolute(a, b):
            return a

        def _distance(a, b):
            return a - b

        def _ratio(a, b):
            return a / b

        if args.margin == 'absolute':
            margin = _absolute
        elif args.margin == 'distance':
            margin = _distance
        else:  # args.margin == 'ratio':
            margin = _ratio

        if args.tumode == 'score':
            scoreBitext(src_inds, trg_inds, x, y, x2y_mean, y2x_mean,
                        out_path, args.encoding, margin)
            self._storage.push(out_path, args.output)
            stats = getScoreDist(out_path)
        elif args.tumode == 'mine':
            # Same language on both sides: tell the two outputs apart.
            if args.srclang == args.tgtlang:
                src_suffix, tgt_suffix = "_s", "_t"
            else:
                src_suffix, tgt_suffix = '', ''

            def _output_names(lang, suffix, input_path):
                # (local, remote) output names for one side; '.gz' is
                # appended when that side's input file name ends in '.gz'.
                tail = '.' + lang + suffix
                if input_path.endswith('.gz'):
                    tail = tail + '.gz'
                return out_path + tail, args.output + tail

            src_out, src_out_remote = _output_names(
                args.srclang, src_suffix, src_path)
            tgt_out, tgt_out_remote = _output_names(
                args.tgtlang, tgt_suffix, tgt_path)
            score_out = out_path + '.tuminer-score'
            score_out_remote = args.output + '.tuminer-score'

            mineBitext(src_sents, trg_sents, x, y,
                       x2y_ind, x2y_mean, y2x_ind, y2x_mean,
                       src_out, tgt_out, score_out,
                       args.encoding, margin, retrieval,
                       args.threshold, args.verbose)
            self._storage.push(src_out, src_out_remote)
            self._storage.push(tgt_out, tgt_out_remote)
            self._storage.push(score_out, score_out_remote)
            stats = getScoreDist(score_out)

        statCnt, statMin, statMax, statAvg, statStddev = stats
        logger.info('Score statistics -- CNT: {:d}, MIN: {:f}, MAX: {:f}, AVG: {:f}, STDDEV: {:f}'
                    .format(statCnt, statMin, statMax, statAvg, statStddev))