def __init__(self, p_path):
    self.args = Namespace(
        base_dir=os.path.normpath(
            os.path.join(p_path, './tasks/similarity')),
        bpe_codes=os.path.normpath(
            os.path.join(p_path, './models/93langs.fcodes')),
        buffer_size=100,
        cpu=False,
        data=os.path.normpath(
            os.path.join(p_path, './tasks/similarity/dev/input')),
        encoder=os.path.normpath(
            os.path.join(p_path, './models/bilstm.93langs.2018-12-26.pt')),
        max_sentences=None,
        max_tokens=12000,
        output=os.path.normpath(
            os.path.join(p_path, './tasks/similarity/embed/output')),
        textual=False,
        verbose=True)
    self.enc = EncodeLoad(self.args)
    out_dir = os.path.dirname(self.args.output)
    if not os.path.exists(out_dir):
        print(' - creating directory {}'.format(out_dir))
        os.mkdir(out_dir)
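# --- Illustrative sketch, not part of the original LASER sources -------------
# The __init__ above replays the embedding script's command-line options as a
# plain argparse.Namespace so that EncodeLoad() can be driven programmatically
# from a wrapper class instead of via the CLI. The pattern itself needs nothing
# beyond argparse.Namespace; attribute access behaves exactly like parsed CLI
# arguments (the LASER path below is hypothetical):
from argparse import Namespace
import os

demo_args = Namespace(
    encoder=os.path.normpath(
        os.path.join('/path/to/LASER', './models/bilstm.93langs.2018-12-26.pt')),
    max_tokens=12000, max_sentences=None, cpu=False, verbose=True)
print(demo_args.encoder, demo_args.max_tokens)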
                    default=12000,
                    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences', type=int, default=None,
                    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu', action='store_true',
                    help='Use CPU instead of GPU')
args = parser.parse_args()

print('LASER: similarity search')

print('\nProcessing:')
enc = EncodeLoad(args)

out_dir = os.path.dirname(args.output)
if not os.path.exists(out_dir):
    print(' - creating directory {}'.format(out_dir))
    os.mkdir(out_dir)

all_data = []
all_index = []
for l in args.lang:
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          romanize=True if l == 'el' else False,
          lower_case=True,
          verbose=args.verbose,
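# --- Illustrative sketch, not part of the original similarity script ----------
# The loop above produces one tokenized file per language: each input
# '<base_dir>/<data>.<l>' is written to '<base_dir>/<output>.tok.<l>', and only
# Greek ('el') is romanized before tokenization. With hypothetical values the
# resulting paths look like this:
import os

base_dir = '/path/to/LASER/tasks/similarity'   # hypothetical checkout location
data, output = 'dev/input', 'embed/output'
for l in ['en', 'el']:
    src = os.path.join(base_dir, data + '.' + l)
    tok = os.path.join(base_dir, output + '.tok.' + l)
    print('{} -> {} (romanize={})'.format(src, tok, l == 'el'))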
print('\nLASER: paraphrase tool')
args = parser.parse_args()

# index,
# memory mapped texts, references and word counts
# encoder
params = namedtuple('params', 'idx T R W enc')

# load FAISS index
params.idx = IndexLoad(args.index, args.nprobe)

# open text and reference file
params.T, params.R, params.W = IndexTextOpen(args.text)

# load sentence encoder
params.enc = EncodeLoad(args)

margin_methods = {'absolute': MarginAbs,
                  'distance': MarginDist,
                  'ratio': MarginRatio}

with tempfile.TemporaryDirectory() as tmpdir:
    ifile = args.input
    if args.token_lang != '--':
        ifile = os.path.join(tmpdir, 'tok')
        Token(args.input, ifile,
              lang=args.token_lang,
              romanize=True if args.token_lang == 'el' else False,
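# --- Illustrative sketch, not part of the original paraphrase tool ------------
# The margin_methods table above selects one of three margin criteria in the
# spirit of LASER's margin-based scoring: a candidate's cosine similarity `a` is
# judged against the average similarity `b` of its nearest neighbours. A minimal
# stand-in for the three criteria (the real MarginAbs/MarginDist/MarginRatio are
# defined in the LASER sources and may differ in detail):

def margin_absolute(a, b):
    return a          # raw similarity, neighbourhood ignored

def margin_distance(a, b):
    return a - b      # difference to the average neighbour similarity

def margin_ratio(a, b):
    return a / b      # ratio to the average neighbour similarity

a, b = 0.82, 0.75     # hypothetical candidate vs. average k-NN similarity
for name, fn in (('absolute', margin_absolute),
                 ('distance', margin_distance),
                 ('ratio', margin_ratio)):
    print('{:8s} margin = {:.3f}'.format(name, fn(a, b)))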