def main():
    parser = argparse.ArgumentParser(description='Classify text with a model')
    parser.add_argument(
        '--model',
        help='The path to either the .zip file created by training or to the client bundle '
             'created by exporting',
        required=True,
        type=str)
    parser.add_argument(
        '--text',
        help='The text to classify as a string, or a path to a file with each line as an example',
        type=str)
    parser.add_argument('--backend', help='backend', choices={'tf', 'pytorch', 'onnx'}, default='tf')
    parser.add_argument('--remote', help='(optional) remote endpoint, normally localhost:8500', type=str)
    parser.add_argument('--name', help='(optional) service name, as the server may serve multiple models', type=str)
    parser.add_argument('--device', help='device')
    parser.add_argument('--preproc', help='(optional) where to perform preprocessing', choices={'client', 'server'}, default='client')
    parser.add_argument('--batchsz', help='batch size when --text is a file', default=100, type=int)
    parser.add_argument('--model_type', type=str, default='default')
    parser.add_argument('--modules', default=[], nargs="+")
    parser.add_argument('--prefer_eager', help="If running in TensorFlow, should we prefer eager mode", type=str2bool, default=False)
    parser.add_argument('--scores', '-s', action="store_true")
    args = parser.parse_args()

    if args.backend == 'tf':
        from eight_mile.tf.layers import set_tf_eager_mode
        set_tf_eager_mode(args.prefer_eager)

    for mod_name in args.modules:
        bl.import_user_module(mod_name)

    if os.path.exists(args.text) and os.path.isfile(args.text):
        texts = []
        with open(args.text, 'r') as f:
            for line in f:
                text = line.strip().split()
                texts += [text]
    else:
        texts = [args.text.split()]

    batched = [texts[i:i + args.batchsz] for i in range(0, len(texts), args.batchsz)]

    m = bl.ClassifierService.load(args.model, backend=args.backend, remote=args.remote,
                                  name=args.name, preproc=args.preproc, device=args.device,
                                  model_type=args.model_type)
    for texts in batched:
        for text, output in zip(texts, m.predict(texts)):
            if args.scores:
                print("{}, {}".format(" ".join(text), output))
            else:
                print("{}, {}".format(" ".join(text), output[0][0]))
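For reference, the same flow can be driven directly through the service API that the CLI above wraps. A minimal sketch, assuming a hypothetical local model bundle and input sentence:

import baseline as bl

# Hypothetical model path and sentence; predict() takes a list of token lists and,
# per example, returns (label, score) pairs with the best guess first, which is
# what the output[0][0] print above relies on.
svc = bl.ClassifierService.load('./sst2-model.zip', backend='tf', preproc='client')
for label, score in svc.predict([['a', 'great', 'movie']])[0]:
    print(label, score)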
def main():
    parser = argparse.ArgumentParser(description='Tag text with a model')
    parser.add_argument('--model', help='A tagger model with extended features', required=True, type=str)
    parser.add_argument('--text', help='raw value', type=str)
    parser.add_argument('--conll', help='is file type conll?', type=str2bool, default=False)
    parser.add_argument(
        '--features',
        help='(optional) features in the format feature_name:index (column # in conll) or '
             'just feature names (assumed sequential)',
        default=[],
        nargs='+')
    parser.add_argument('--backend', help='backend', default='tf')
    parser.add_argument('--device', help='device')
    parser.add_argument('--remote', help='(optional) remote endpoint', type=str)  # localhost:8500
    parser.add_argument('--name', help='(optional) signature name', type=str)
    parser.add_argument('--preproc', help='(optional) where to perform preprocessing', choices={'client', 'server'}, default='client')
    parser.add_argument(
        '--export_mapping',
        help='mapping between features and the fields in the grpc/ REST '
             'request, eg: token:word ner:ner. This should match with the '
             '`exporter_field` definition in the mead config',
        default=[],
        nargs='+')
    parser.add_argument('--prefer_eager', help="If running in TensorFlow, should we prefer eager mode", type=str2bool)
    parser.add_argument('--modules', default=[], nargs="+")
    parser.add_argument('--batchsz', default=64, help="How many examples to run through the model at once", type=int)
    args = parser.parse_args()

    if args.backend == 'tf':
        from eight_mile.tf.layers import set_tf_eager_mode
        set_tf_eager_mode(args.prefer_eager)

    for mod_name in args.modules:
        bl.import_user_module(mod_name)

    def create_export_mapping(feature_map_strings):
        feature_map_strings = [x.strip() for x in feature_map_strings if x.strip()]
        if not feature_map_strings:
            return {}
        return {x[0]: x[1] for x in [y.split(':') for y in feature_map_strings]}

    def feature_index_mapping(features):
        if not features:
            return {}
        elif ':' in features[0]:
            return {feature.split(':')[0]: int(feature.split(':')[1]) for feature in features}
        else:
            return {feature: index for index, feature in enumerate(features)}

    if os.path.exists(args.text) and os.path.isfile(args.text):
        texts = []
        if args.conll:
            feature_indices = feature_index_mapping(args.features)
            for sentence in read_conll(args.text):
                if feature_indices:
                    texts.append([{k: line[v] for k, v in feature_indices.items()} for line in sentence])
                else:
                    texts.append([line[0] for line in sentence])
        else:
            with open(args.text, 'r') as f:
                for line in f:
                    text = line.strip().split()
                    texts += [text]
    else:
        texts = [args.text.split()]

    m = bl.TaggerService.load(args.model, backend=args.backend, remote=args.remote,
                              name=args.name, preproc=args.preproc, device=args.device)

    batched = [texts[i:i + args.batchsz] for i in range(0, len(texts), args.batchsz)]

    for texts in batched:
        for sen in m.predict(texts, export_mapping=create_export_mapping(args.export_mapping)):
            for word_tag in sen:
                print("{} {}".format(word_tag['text'], word_tag['label']))
            print()
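The tagger service can be exercised the same way; a minimal sketch with a hypothetical model bundle and sentence, relying only on the 'text'/'label' keys the print loop above already assumes:

import baseline as bl

# Hypothetical model path and input; each predicted sentence is a list of
# {'text': ..., 'label': ...} dicts, mirroring the loop in the script above.
svc = bl.TaggerService.load('./conll-ner-model.zip', backend='tf', preproc='client')
for sentence in svc.predict([['Brad', 'Pitt', 'visited', 'Paris']]):
    for word_tag in sentence:
        print(word_tag['text'], word_tag['label'])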
def run(input_files=[], input_pattern='*.txt', codes=None, vocab=None, nctx=256, fmt='json',
        fields=['x_str', 'y_str'], output=None, prefix=None, suffix=None, max_file_size=100,
        tok_on_eol="<EOS>", cased=True, mask_type="mlm", module=None, pad_y=True,
        extra_tokens=['[CLS]', '[MASK]'], world_size=1, world_offset=0, input_field='text',
        tokenizer_type=None, **kwargs):

    def parse_json_line(x):
        return json.loads(x)[input_field]

    if module:
        logger.warning("Loading custom user module %s for masking rules and tokenizers", module)
        baseline.import_user_module(module)

    get_line = lambda x: x.strip()
    if os.path.isdir(input_files):
        if '.json' in input_pattern:
            get_line = parse_json_line
        if not output:
            output = os.path.join(input_files, 'records')
        input_files = list(glob.glob(os.path.join(input_files, input_pattern)))
    else:
        if '.json' in input_files:
            get_line = parse_json_line
        if not output:
            output = f'{input_files}.records'
        input_files = [input_files]

    if len(input_files) < world_size:
        raise Exception(f"The number of input shards ({len(input_files)}) should be greater than the world_size: {world_size}")

    logger.info('Output [%s]', output)
    transform = baseline.lowercase if not cased else lambda x: x
    vectorizer = BPEVectorizer1D(transform_fn=transform, model_file=codes, vocab_file=vocab,
                                 mxlen=1024, extra_tokens=extra_tokens)

    lookup_indices = []
    indices2word = baseline.revlut(vectorizer.vocab)
    root_dir = os.path.dirname(output)
    tokenizer = create_tokenizer(tokenizer_type)
    masking = create_masking(mask_type, vectorizer.vocab, pad_y)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)

    if prefix:
        nctx -= 1
        prefix = vectorizer.vocab[prefix]

    if suffix:
        nctx -= 1
        suffix = vectorizer.vocab[suffix]

    fw = create_file_writer(fmt, output, fields, max_file_size, 1000 * world_offset)
    num_samples = 0
    for i, text in enumerate(input_files):
        if i % world_size != world_offset:
            continue
        with TextFile(text) as rf:
            print(f"Reading from {text}...")
            for line in rf:
                to_bpe = tokenizer(get_line(line))
                if not to_bpe:
                    continue
                to_bpe += [tok_on_eol]

                output, available = vectorizer.run(to_bpe, vectorizer.vocab)
                while available > 0:
                    if len(lookup_indices) == nctx:
                        record = create_record(lookup_indices, indices2word, prefix, suffix, masking=masking)
                        fw.write(record)
                        num_samples += 1
                        lookup_indices = []
                    needed = nctx - len(lookup_indices)
                    if available >= needed:
                        lookup_indices += output[:needed].tolist()
                        output = output[needed:]
                        available -= needed
                        record = create_record(lookup_indices, indices2word, prefix, suffix, masking=masking)
                        fw.write(record)
                        num_samples += 1
                        lookup_indices = []
                    # The amount available is less than what we need, so read the whole thing
                    else:
                        lookup_indices += output[:available].tolist()
                        available = 0

    fw.close()
    f_name = f'md-{world_offset}.yml' if world_size > 1 else 'md.yml'
    write_yaml({'num_samples': num_samples}, os.path.join(root_dir, f_name))
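A hedged example of calling run() above on a directory of raw text: every path and the prefix token are placeholders, and the keyword names simply mirror the function signature.

# Hypothetical invocation: chunk a directory of *.txt shards into fixed-length
# MLM records. codes/vocab point at BPE artifacts for BPEVectorizer1D.
run(input_files='./corpus',
    input_pattern='*.txt',
    codes='./codes.30k',
    vocab='./vocab.30k',
    nctx=256,
    prefix='[CLS]',
    mask_type='mlm',
    fmt='json',
    output='./corpus-records/records',
    world_size=1,
    world_offset=0)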
def main():
    parser = argparse.ArgumentParser(description='Classify paired text (NLI) with a model')
    parser.add_argument(
        '--model',
        help='The path to either the .zip file created by training or to the client bundle '
             'created by exporting',
        required=True,
        type=str)
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--text1', type=str)
    parser.add_argument('--text2', type=str)
    parser.add_argument('--file', type=str)
    parser.add_argument('--backend', help='backend', choices={'tf', 'pytorch'}, default='pytorch')
    parser.add_argument('--device', help='device')
    parser.add_argument('--batchsz', help='batch size when --file is given', default=100, type=int)
    parser.add_argument('--modules', default=[], nargs="+")
    parser.add_argument('--prefer_eager', help="If running in TensorFlow, should we prefer eager mode", type=str2bool, default=False)
    args = parser.parse_args()

    if args.backend == 'tf':
        from eight_mile.tf.layers import set_tf_eager_mode
        set_tf_eager_mode(args.prefer_eager)

    for mod_name in args.modules:
        bl.import_user_module(mod_name)

    if os.path.exists(args.file) and os.path.isfile(args.file):
        df = pd.read_csv(args.file)
        text_1 = [x.strip().split() for x in df['hypothesis']]
        text_2 = [x.strip().split() for x in df['premise']]
    else:
        text_1 = [args.text1.split()]
        text_2 = [args.text2.split()]

    text_1_batched = [text_1[i:i + args.batchsz] for i in range(0, len(text_1), args.batchsz)]
    text_2_batched = [text_2[i:i + args.batchsz] for i in range(0, len(text_2), args.batchsz)]

    config = read_config_file_or_json(args.config)
    loader_config = config['loader']
    model_type = config['model']['model_type']
    model = NLIService.load(args.model, model_type=model_type, backend=args.backend,
                            device=args.device, **loader_config)

    for text_1_batch, text_2_batch in zip(text_1_batched, text_2_batched):
        output_batch = model.predict(text_1_batch, text_2_batch)
        for text_1, text_2, output in zip(text_1_batch, text_2_batch, output_batch):
            print("text1: {}, text2: {}, prediction: {}".format(" ".join(text_1), " ".join(text_2), output[0][0]))
            print('=' * 30)
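When --file is used, the script expects a CSV with 'hypothesis' and 'premise' columns. An illustrative way to produce one (the file name and sentence pairs are made up; the column names come from the script above):

import pandas as pd

# Illustrative CSV in the layout read by --file above.
pd.DataFrame({
    'hypothesis': ['A man is sleeping', 'Two dogs are running'],
    'premise': ['A man is playing a guitar on stage', 'Two dogs race across a field'],
}).to_csv('nli_pairs.csv', index=False)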
def run(input_files=[], input_pattern='*.txt', codes=None, vocab=None, nctx=256, fmt='json',
        fields=['x_str', 'y_str'], output=None, x_prefix=None, x_suffix=None, y_prefix=None,
        y_suffix=None, max_file_size=100, cased=True, mask_type="mlm", module=None, pad_y=True,
        extra_tokens=['[CLS]', '[MASK]'], tgt_nctx=None, world_size=1, world_offset=0,
        subword_type='bpe', **kwargs):
    timer = Timer()

    if module:
        logger.warning("Loading custom user module %s for masking rules", module)
        baseline.import_user_module(module)

    if os.path.isdir(input_files):
        import glob
        if not output:
            output = os.path.join(input_files, 'records')
        input_files = list(glob.glob(os.path.join(input_files, input_pattern)))
    else:
        if not output:
            output = f'{input_files}.records'
        input_files = [input_files]

    logger.info('Output [%s]', output)
    if not tgt_nctx:
        tgt_nctx = 64
    transform = baseline.lowercase if not cased else lambda x: x
    Vec1D = get_subword_vec1d(subword_type)
    vectorizer = Vec1D(transform_fn=transform, model_file=codes, vocab_file=vocab,
                       mxlen=1024, extra_tokens=extra_tokens)

    if x_prefix:
        x_prefix = vectorizer.vocab[x_prefix]
    if x_suffix:
        x_suffix = vectorizer.vocab[x_suffix]
    if y_prefix:
        y_prefix = vectorizer.vocab[y_prefix]
    if y_suffix:
        y_suffix = vectorizer.vocab[y_suffix]

    indices2word = baseline.revlut(vectorizer.vocab)
    root_dir = os.path.dirname(output)
    masking = create_masking(mask_type, vectorizer.vocab, pad_y)
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)

    # Create a file writer for this shard
    fw = create_file_writer(fmt, output, fields, max_file_size, 1000 * world_offset)
    num_read = -1
    num_samples_this_worker = 0
    for text in input_files:
        with open(text, encoding='utf-8') as rf:
            print(f"Reading from {text}...")
            for line in rf:
                num_read += 1
                if num_read % world_size != world_offset:
                    continue
                to_bpe = line.strip().split()
                if not to_bpe:
                    continue

                output, available = vectorizer.run(to_bpe, vectorizer.vocab)
                x, y = masking(output[:available], False, False)
                if x_prefix:
                    x = [x_prefix] + x
                if y_prefix:
                    y = [y_prefix] + y
                if x_suffix:
                    x += [x_suffix]
                if y_suffix:
                    y += [y_suffix]
                x = x[:nctx]
                y = y[:tgt_nctx]
                x_t = np.zeros(nctx, dtype=output.dtype)
                y_t = np.zeros(tgt_nctx, dtype=output.dtype)
                x_t[:len(x)] = x
                y_t[:len(y)] = y
                record = {'x': x_t, 'y': y_t,
                          'x_str': [indices2word[s] for s in x_t],
                          'y_str': [indices2word[s] for s in y_t]}
                if masking.is_valid(record):
                    fw.write(record)
                    num_samples_this_worker += 1

    fw.close()
    duration = timer.elapsed()
    print("Processed {:,} samples in {:.2f}s".format(num_samples_this_worker, duration))
    f_name = f'md-{world_offset}.yml' if world_size > 1 else 'md.yml'
    write_yaml({'num_samples': num_samples_this_worker}, os.path.join(root_dir, f_name))
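Sharding this job across workers only needs world_size and world_offset; below is a hedged sketch where, for brevity, one loop plays the role of four workers (in practice each offset would run in its own process, and all paths are placeholders).

# Hypothetical sharded invocation: worker k of 4 keeps every 4th line and
# writes its own record files plus md-k.yml under ./records/.
for offset in range(4):
    run(input_files='./corpus.txt',
        codes='./codes.30k',
        vocab='./vocab.30k',
        nctx=256,
        tgt_nctx=64,
        mask_type='mlm',
        fmt='json',
        output=f'./records/shard-{offset}',
        world_size=4,
        world_offset=offset)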
def main():
    parser = argparse.ArgumentParser(description='Classify paired text with a model')
    parser.add_argument(
        '--model',
        help='The path to either the .zip file created by training or to the client bundle '
             'created by exporting',
        required=True,
        type=str)
    parser.add_argument(
        '--text',
        help='The text to classify as a string, or a path to a file with each line as an example',
        type=str)
    parser.add_argument('--backend', help='backend', choices={'tf', 'pytorch', 'onnx'}, default='pytorch')
    parser.add_argument('--remote', help='(optional) remote endpoint, normally localhost:8500', type=str)
    parser.add_argument('--name', help='(optional) service name, as the server may serve multiple models', type=str)
    parser.add_argument('--device', help='device')
    parser.add_argument('--preproc', help='(optional) where to perform preprocessing', choices={'client', 'server'}, default='client')
    parser.add_argument('--batchsz', help='batch size when --text is a file', default=100, type=int)
    parser.add_argument('--model_type', type=str, default='default')
    parser.add_argument('--modules', default=[], nargs="+")
    parser.add_argument('--scores', '-s', action="store_true")
    parser.add_argument('--label_first', action="store_true", help="The first column is the label; the remaining columns are the text pair")
    parser.add_argument("--output_delim", default="\t")
    parser.add_argument("--output_type", default="tsv", choices=["tsv", "json"])
    parser.add_argument("--no_text_output", action="store_true", help="Don't write the text")
    args = parser.parse_args()

    for mod_name in args.modules:
        bl.import_user_module(mod_name)

    labels = []
    if os.path.exists(args.text) and os.path.isfile(args.text):
        texts = []
        with open(args.text, 'r') as f:
            for line in f:
                text = line.strip().split('\t')
                if args.label_first:
                    labels.append(text[0])
                    text = text[1:]
                first = text[0].split()
                second = text[1].split()
                text = [first, second]
                texts += [text]
    else:
        texts = [args.text.split()]

    batched = [texts[i:i + args.batchsz] for i in range(0, len(texts), args.batchsz)]

    m = bl.ClassifierService.load(args.model, backend=args.backend, remote=args.remote,
                                  name=args.name, preproc=args.preproc, device=args.device,
                                  model_type=args.model_type)

    if args.label_first:
        label_iter = iter(labels)

    for texts in batched:
        for text, output in zip(texts, m.predict(texts)):
            if args.scores:
                guess_output = output
            else:
                guess_output = output[0][0]
            if args.output_type == 'tsv':
                if args.no_text_output:
                    text_output = ''
                else:
                    text = text[0] + [args.output_delim] + text[1]
                    text_output = ' '.join(text) + args.output_delim
                s = f"{text_output}{guess_output}"
                if args.label_first:
                    s = f"{next(label_iter)}{args.output_delim}{s}"
            else:
                text_output = [' '.join(text[0]), ' '.join(text[1])]
                if args.scores:
                    guess_output = {kv[0]: kv[1] for kv in guess_output}
                json_output = {'prediction': guess_output}
                if not args.no_text_output:
                    json_output['text'] = text_output
                if args.label_first:
                    json_output['label'] = next(label_iter)
                s = json.dumps(json_output)
            print(s)
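The --text file is read as tab-separated columns, with an optional leading label column when --label_first is set. An illustrative way to build such a file (file name and sentence pairs are made up):

# Illustrative input for --text with --label_first: label<TAB>text1<TAB>text2 per line.
with open('pairs.tsv', 'w') as wf:
    wf.write('entailment\tA man is sleeping\tA man is at rest\n')
    wf.write('contradiction\tTwo dogs are running\tThe dogs are asleep\n')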
def main():
    parser = argparse.ArgumentParser(description='Run senteval harness')
    parser.add_argument('--nctx', default=512, type=int)
    parser.add_argument("--module", default=None, help="Module containing custom tokenizers")
    parser.add_argument('--tasks', nargs="+", default=['sts', 'class', 'probe'])
    parser.add_argument('--batchsz', default=20, type=int)
    parser.add_argument('--tok', help='Optional tokenizer, e.g. "gpt2" or "basic". These can be defined in an extra module')
    parser.add_argument('--pool', help="Should a reduction be applied on the embeddings? Only use if your embeddings aren't already pooled", type=str)
    parser.add_argument('--vec_id', help='Reference to a specific embedding type')
    parser.add_argument('--embed_id', help='What type of embeddings to use')
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument('--max_len1d', type=int, default=100)
    parser.add_argument('--embeddings', help='index of embeddings: local file, remote URL or mead-ml/hub ref', type=convert_path)
    parser.add_argument('--vecs', help='index of vectorizers: local file, remote URL or mead-ml/hub ref', type=convert_path)
    parser.add_argument('--fast', help="Run fast, but not necessarily as accurately", action='store_true')
    parser.add_argument('--data', help="Path to senteval data", default=os.path.expanduser("~/dev/work/SentEval/data"))
    args = parser.parse_args()

    if args.module:
        logger.warning("Loading custom user module %s for masking rules and tokenizers", args.module)
        baseline.import_user_module(args.module)

    tokenizer = create_tokenizer(args.tok) if args.tok else None

    args.embeddings = convert_path(DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)

    args.vecs = convert_path(DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs

    vecs_index = read_config_stream(args.vecs)
    vecs_set = index_by_label(vecs_index)
    vec_params = vecs_set[args.vec_id]
    vec_params['mxlen'] = args.nctx

    if 'transform' in vec_params:
        vec_params['transform_fn'] = vec_params['transform']
    if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
        vec_params['transform_fn'] = eval(vec_params['transform_fn'])

    vectorizer = create_vectorizer(**vec_params)
    if not isinstance(vectorizer, HasPredefinedVocab):
        raise Exception("We currently require a vectorizer with a pre-defined vocab to run this script")

    embeddings_index = read_config_stream(args.embeddings)
    embeddings_set = index_by_label(embeddings_index)
    embeddings_params = embeddings_set[args.embed_id]
    embeddings = load_embeddings_overlay(embeddings_set, embeddings_params, vectorizer.vocab)

    embedder = embeddings['embeddings']
    embedder.to(args.device).eval()

    def _mean_pool(inputs, embeddings):
        mask = (inputs != 0)
        seq_lengths = mask.sum(1).unsqueeze(-1)
        return embeddings.sum(1) / seq_lengths

    def _zero_tok_pool(_, embeddings):
        pooled = embeddings[:, 0]
        return pooled

    def _max_pool(inputs, embeddings):
        mask = (inputs != 0)
        embeddings = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
        return torch.max(embeddings, 1, False)[0]

    if args.pool:
        if args.pool == 'max':
            pool = _max_pool
        elif args.pool == 'zero' or args.pool == 'cls':
            pool = _zero_tok_pool
        else:
            pool = _mean_pool
    else:
        pool = lambda x, y: y

    params_senteval = {'task_path': args.data, 'usepytorch': True, 'kfold': 10}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 'tenacity': 5, 'epoch_size': 4}

    if args.fast:
logging.info("Setting fast params") params_senteval['kfold'] = 5 params_senteval['classifier']['epoch_size'] = 2 params_senteval['classifier']['tenacity'] = 3 params_senteval['classifier']['batch_size'] = 128 # SentEval prepare and batcher def prepare(params, samples): max_sample = max(len(s) for s in samples) vectorizer.mxlen = min(args.nctx, max_sample + SUBWORD_EXTRA) logging.info('num_samples %d, mxlen set to %d', max_sample, vectorizer.mxlen) def batcher(params, batch): if not tokenizer: batch = [sent if sent != [] else ['.'] for sent in batch] else: batch = [tokenizer(' '.join(sent)) for sent in batch] vs = [] for sent in batch: v, l = vectorizer.run(sent, vectorizer.vocab) vs.append(v) vs = np.stack(vs) with torch.no_grad(): inputs = torch.tensor(vs, device=args.device) encoding = embedder(inputs) encoding = pool(inputs, encoding) encoding = encoding.cpu().numpy() return encoding se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = [] if 'sts' in args.tasks: transfer_tasks += ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'SICKRelatedness', 'STSBenchmark'] if 'class' in args.tasks: transfer_tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment'] if 'probe' in args.tasks: transfer_tasks += ['Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion'] results = se.eval(transfer_tasks) print(results)