Пример #1
0
 def __init__(self, p_path):
     """Build the similarity-task arguments and hand them to EncodeLoad.

     Every path stored in ``self.args`` is resolved relative to
     ``p_path`` and normalized. The result of ``EncodeLoad(self.args)``
     is kept in ``self.enc``. Finally, the directory that contains
     ``self.args.output`` is created when it does not exist yet.

     Args:
         p_path: Root directory under which the ``tasks`` and ``models``
             sub-directories are looked up.
     """

     def resolve(relative):
         # Anchor the relative path at p_path and collapse the './' part.
         return os.path.normpath(os.path.join(p_path, relative))

     self.args = Namespace(
         base_dir=resolve('./tasks/similarity'),
         bpe_codes=resolve('./models/93langs.fcodes'),
         buffer_size=100,
         cpu=False,
         data=resolve('./tasks/similarity/dev/input'),
         encoder=resolve('./models/bilstm.93langs.2018-12-26.pt'),
         max_sentences=None,
         max_tokens=12000,
         output=resolve('./tasks/similarity/embed/output'),
         textual=False,
         verbose=True)
     self.enc = EncodeLoad(self.args)

     out_dir = os.path.dirname(self.args.output)
     if not os.path.exists(out_dir):
         print(' - creating directory {}'.format(out_dir))
         # makedirs also creates missing parent directories, and
         # exist_ok avoids a crash if the directory shows up between
         # the existence check and this call.
         os.makedirs(out_dir, exist_ok=True)
Пример #2
0
                    default=12000,
                    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences',
                    type=int,
                    default=None,
                    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu',
                    action='store_true',
                    help='Use CPU instead of GPU')

args = parser.parse_args()

print('LASER: similarity search')

print('\nProcessing:')
enc = EncodeLoad(args)

# Make sure the directory that will contain the output exists.
out_dir = os.path.dirname(args.output)
# dirname() is '' when args.output is a bare file name; there is nothing
# to create in that case and os.mkdir('') would raise.
if out_dir and not os.path.exists(out_dir):
    print(' - creating directory {}'.format(out_dir))
    # makedirs also creates missing parent directories, and exist_ok
    # tolerates the directory appearing between the check and this call.
    os.makedirs(out_dir, exist_ok=True)

all_data = []
all_index = []
for l in args.lang:
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          romanize=True if l == 'el' else False,
          lower_case=True,
          verbose=args.verbose,
Пример #3
0
print('\nLASER: paraphrase tool')
args = parser.parse_args()

# `params` groups everything used further down:
#   idx     - index
#   T, R, W - memory mapped texts, references and word counts
#   enc     - encoder
# NOTE(review): the fields below are assigned on the namedtuple class
# itself; the visible code never creates an instance of it.
params = namedtuple('params', ['idx', 'T', 'R', 'W', 'enc'])

# FAISS index
params.idx = IndexLoad(args.index, args.nprobe)

# text and reference file
(params.T,
 params.R,
 params.W) = IndexTextOpen(args.text)

# sentence encoder
params.enc = EncodeLoad(args)

# Lookup table: margin name -> margin implementation.
margin_methods = dict(
    absolute=MarginAbs,
    distance=MarginDist,
    ratio=MarginRatio,
)

with tempfile.TemporaryDirectory() as tmpdir:
    ifile = args.input
    if args.token_lang != '--':
        ifile = os.path.join(tmpdir, 'tok')
        Token(args.input,
              ifile,
              lang=args.token_lang,
              romanize=True if args.token_lang == 'el' else False,