def test_file_exists(file_name):
    """An existing file is returned as-is by convert_path, never joined onto the location."""
    loc = "/dev"
    resolved = convert_path(file_name, loc)
    assert resolved == file_name
    assert resolved != os.path.join(loc, file_name)
def test_no_loc():
    """With no location argument, convert_path resolves names relative to the sibling ``mead`` directory."""
    name = "test"
    here = os.path.dirname(os.path.realpath(__file__))
    expected = os.path.realpath(os.path.join(here, "..", "mead", name))
    assert convert_path(name) == expected
def test_loc():
    """When an explicit location is supplied, convert_path joins the file name onto it."""
    name, loc = "test", "/dev"
    assert convert_path(name, loc) == os.path.join(loc, name)
def main():
    """CLI entry point: train a mead task (classify/tagger/seq2seq/lm).

    Parses known args, loads the experiment config plus the mead settings,
    resolves dataset/embedding/vectorizer indices (CLI flag -> settings ->
    built-in default), then builds and trains the task.  Unknown CLI args
    are treated as config overrides and reporting args.
    """
    parser = argparse.ArgumentParser(description='Train a text classifier')
    parser.add_argument('--config',
                        help='JSON/YML Configuration for an experiment: local file or remote URL',
                        type=convert_path, default="$MEAD_CONFIG")
    parser.add_argument('--settings', help='JSON/YML Configuration for mead',
                        default=DEFAULT_SETTINGS_LOC, type=convert_path)
    parser.add_argument('--task_modules', help='tasks to load, must be local',
                        default=[], nargs='+', required=False)
    parser.add_argument('--datasets',
                        help='index of dataset labels: local file, remote URL or mead-ml/hub ref',
                        type=convert_path)
    parser.add_argument('--modules',
                        help='modules to load: local files, remote URLs or mead-ml/hub refs',
                        default=[], nargs='+', required=False)
    parser.add_argument('--mod_train_file', help='override the training set')
    parser.add_argument('--mod_valid_file', help='override the validation set')
    parser.add_argument('--mod_test_file', help='override the test set')
    parser.add_argument('--fit_func', help='override the fit function')
    parser.add_argument('--embeddings',
                        help='index of embeddings: local file, remote URL or mead-ml/hub ref',
                        type=convert_path)
    parser.add_argument('--vecs',
                        help='index of vectorizers: local file, remote URL or hub mead-ml/ref',
                        type=convert_path)
    parser.add_argument('--logging', help='json file for logging',
                        default=DEFAULT_LOGGING_LOC, type=convert_path)
    parser.add_argument('--task', help='task to run',
                        choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--gpus', help='Number of GPUs (defaults to number available)',
                        type=int, default=-1)
    parser.add_argument('--basedir',
                        help='Override the base directory where models are stored',
                        type=str)
    parser.add_argument('--reporting', help='reporting hooks', nargs='+')
    parser.add_argument('--backend', help='The deep learning backend to use')
    parser.add_argument('--checkpoint', help='Restart training from this checkpoint')
    parser.add_argument('--prefer_eager',
                        help="If running in TensorFlow, should we prefer eager model",
                        type=str2bool)
    # parse_known_args: leftover args double as config overrides and reporting args below
    args, overrides = parser.parse_known_args()
    config_params = read_config_stream(args.config)
    config_params = parse_and_merge_overrides(config_params, overrides, pre='x')
    if args.basedir is not None:
        config_params['basedir'] = args.basedir
    # task_module overrides are not allowed via hub or HTTP, must be defined locally
    for task in args.task_modules:
        import_user_module(task)
    # CLI --task wins; otherwise fall back to the config, then 'classify'
    task_name = config_params.get('task', 'classify') if args.task is None else args.task
    args.logging = read_config_stream(args.logging)
    configure_logger(args.logging, config_params.get('basedir', './{}'.format(task_name)))
    try:
        args.settings = read_config_stream(args.settings)
    # NOTE(review): bare except silently swallows *any* failure (including typos in the
    # settings path semantics) and treats it as "no settings file" — consider narrowing.
    except:
        logger.warning('Warning: no mead-settings file was found at [{}]'.format(args.settings))
        args.settings = {}
    # Resolution order for each index: explicit CLI flag > settings entry > built-in default
    args.datasets = args.settings.get('datasets', convert_path(DEFAULT_DATASETS_LOC)) if args.datasets is None else args.datasets
    args.datasets = read_config_stream(args.datasets)
    if args.mod_train_file or args.mod_valid_file or args.mod_test_file:
        logging.warning('Warning: overriding the training/valid/test data with user-specified files'
                        ' different from what was specified in the dataset index. Creating a new key for this entry')
        update_datasets(args.datasets, config_params, args.mod_train_file,
                        args.mod_valid_file, args.mod_test_file)
    args.embeddings = args.settings.get('embeddings', convert_path(DEFAULT_EMBEDDINGS_LOC)) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)
    args.vecs = args.settings.get('vecs', convert_path(DEFAULT_VECTORIZERS_LOC)) if args.vecs is None else args.vecs
    args.vecs = read_config_stream(args.vecs)
    # NOTE(review): the default of -1 is truthy, so this branch always fires unless the
    # user passes --gpus 0 — confirm whether a -1 sentinel should be written to config.
    if args.gpus:
        # why does it go to model and not to train?
        config_params['train']['gpus'] = args.gpus
    if args.fit_func:
        config_params['train']['fit_func'] = args.fit_func
    if args.backend:
        config_params['backend'] = normalize_backend(args.backend)
    # Merge CLI modules with config modules, de-duplicated
    config_params['modules'] = list(set(chain(config_params.get('modules', []), args.modules)))
    cmd_hooks = args.reporting if args.reporting is not None else []
    config_hooks = config_params.get('reporting') if config_params.get('reporting') is not None else []
    reporting = parse_extra_args(set(chain(cmd_hooks, config_hooks)), overrides)
    config_params['reporting'] = reporting
    logger.info('Task: [{}]'.format(task_name))
    task = mead.Task.get_task_specific(task_name, args.settings)
    task.read_config(config_params,
                     args.datasets,
                     args.vecs,
                     reporting_args=overrides,
                     prefer_eager=args.prefer_eager)
    task.initialize(args.embeddings)
    task.train(args.checkpoint)
def main():
    """CLI entry point: encode a column of text from a CSV/TSV file as embeddings.

    Loads a vectorizer and a pretrained embedder by label, runs the text in
    batches through the embedder with an optional pooling reduction, and saves
    the vectors (plus the raw text and any extra columns) to an .npz file.
    Optionally builds a FAISS inner-product index over the normalized vectors.
    """
    parser = argparse.ArgumentParser(description='Encode a sentence as an embedding')
    parser.add_argument('--subword_model_file', help='Subword model file')
    parser.add_argument('--nctx', default=256, type=int)
    parser.add_argument('--batchsz', default=20, type=int)
    parser.add_argument('--vec_id', default='bert-base-uncased',
                        help='Reference to a specific embedding type')
    parser.add_argument('--embed_id', default='bert-base-uncased',
                        help='What type of embeddings to use')
    parser.add_argument('--file', required=True)
    parser.add_argument('--column', type=str)
    parser.add_argument('--output', default='embeddings.npz')
    parser.add_argument('--pool',
                        help='Should a reduction be applied on the embeddings? Only use if your embeddings arent already pooled',
                        type=str)
    parser.add_argument('--embeddings',
                        help='index of embeddings: local file, remote URL or mead-ml/hub ref',
                        type=convert_path)
    parser.add_argument('--vecs',
                        help='index of vectorizers: local file, remote URL or hub mead-ml/ref',
                        type=convert_path)
    parser.add_argument('--cuda', type=baseline.str2bool, default=True)
    parser.add_argument('--has_header', action="store_true")
    parser.add_argument("--tokenizer_type", type=str,
                        help="Optional tokenizer, default is to use string split")
    parser.add_argument('--faiss_index',
                        help="If provided, we will build a FAISS index and store it here")
    parser.add_argument('--quoting', default=3,
                        help='0=QUOTE_MINIMAL 1=QUOTE_ALL 2=QUOTE_NONNUMERIC 3=QUOTE_NONE',
                        type=int)
    parser.add_argument('--sep', default='\t')
    parser.add_argument('--add_columns', nargs='+', default=[])
    args = parser.parse_args()
    # Without a header row, columns are positional: coerce --column / --add_columns to ints
    if not args.has_header:
        if not args.column:
            args.column = 0
        if args.add_columns:
            args.add_columns = [int(c) for c in args.add_columns]
        column = int(args.column)
    else:
        column = args.column
    args.embeddings = convert_path(DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)
    args.vecs = convert_path(DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs
    vecs_index = read_config_stream(args.vecs)
    vecs_set = index_by_label(vecs_index)
    vec_params = vecs_set[args.vec_id]
    vec_params['mxlen'] = args.nctx
    # The index may spell the transform key either way; normalize to 'transform_fn'
    if 'transform' in vec_params:
        vec_params['transform_fn'] = vec_params['transform']
    # NOTE(review): eval() on a string from the vectorizer index — only safe if the
    # index file is trusted; flagging rather than changing behavior.
    if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
        vec_params['transform_fn'] = eval(vec_params['transform_fn'])
    tokenizer = create_tokenizer(args.tokenizer_type)
    vectorizer = create_vectorizer(**vec_params)
    if not isinstance(vectorizer, HasPredefinedVocab):
        raise Exception("We currently require a vectorizer with a pre-defined vocab to run this script")
    embeddings_index = read_config_stream(args.embeddings)
    embeddings_set = index_by_label(embeddings_index)
    embeddings_params = embeddings_set[args.embed_id]
    # If they dont want CUDA try and get the embedding loader to use CPU
    embeddings_params['cpu_placement'] = not args.cuda
    embeddings = load_embeddings_overlay(embeddings_set, embeddings_params, vectorizer.vocab)
    vocabs = {'x': embeddings['vocab']}
    embedder = embeddings['embeddings'].cpu()
    embedder.eval()  # inference only — disable dropout etc.
    if args.cuda:
        embedder = embedder.cuda()

    def _mean_pool(inputs, embeddings):
        # Mean over non-padded positions (token id 0 is treated as padding)
        mask = (inputs != 0)
        seq_lengths = mask.sum(1).unsqueeze(-1)
        return embeddings.sum(1) / seq_lengths

    def _zero_tok_pool(_, embeddings):
        # Take the first token's vector (e.g. a [CLS]-style token)
        pooled = embeddings[:, 0]
        return pooled

    def _max_pool(inputs, embeddings):
        # Max over time, with padded positions pushed to -1e8 so they never win
        mask = (inputs != 0)
        embeddings = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
        return torch.max(embeddings, 1, False)[0]

    # Select the pooling reduction; any unrecognized --pool value means mean pooling,
    # and no --pool at all passes the embeddings through unreduced.
    if args.pool:
        if args.pool == 'max':
            pool = _max_pool
        elif args.pool == 'zero' or args.pool == 'cls':
            pool = _zero_tok_pool
        else:
            pool = _mean_pool
    else:
        pool = lambda x, y: y

    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    df = pd.read_csv(args.file, header='infer' if args.has_header else None, sep=args.sep)
    col = df[column]
    batches = []
    as_list = col.tolist()
    # Carry along any extra columns into the output archive, keyed by name
    # (or 'column_<i>' for positional indices)
    extra_col_map = {}
    for extra_col in args.add_columns:
        if isinstance(extra_col, int):
            key = f'column_{extra_col}'
        else:
            key = extra_col
        extra_col_map[key] = df[extra_col].tolist()
    num_batches = math.ceil(len(as_list) / args.batchsz)
    pg = baseline.create_progress_bar(num_batches, name='tqdm')
    for i, batch in enumerate(chunks(as_list, args.batchsz)):
        pg.update()
        with torch.no_grad():
            vecs = []
            for line in batch:
                tokenized = tokenizer(line)
                vec, l = vectorizer.run(tokenized, vocabs['x'])
                vecs.append(vec)
            vecs = torch.tensor(np.stack(vecs))
            if args.cuda:
                vecs = vecs.cuda()
            embedding = embedder(vecs)
            pooled_batch = pool(vecs, embedding).cpu().numpy()
            # Flatten the batch into one vector per row of input text
            batches += [x for x in pooled_batch]
    np.savez(args.output, embeddings=batches, text=as_list, **extra_col_map)
    if args.faiss_index:
        import faiss
        # Inner-product index over L2-normalized vectors == cosine similarity
        index = faiss.IndexFlatIP(batches[0].shape[-1])
        batches = np.stack(batches)
        faiss.normalize_L2(batches)
        index.add(batches)
        faiss.write_index(index, args.faiss_index)
def main():
    """CLI entry point: train a mead task (older variant without vecs/fit_func/eager flags).

    Parses known args, loads the experiment config and mead settings, resolves
    dataset/embedding indices (CLI flag -> settings -> built-in default), then
    builds and trains the task.  Unknown CLI args become reporting args.
    """
    parser = argparse.ArgumentParser(description='Train a text classifier')
    parser.add_argument('--config', help='configuration for an experiment',
                        type=convert_path, default="$MEAD_CONFIG")
    parser.add_argument('--settings', help='configuration for mead',
                        default=DEFAULT_SETTINGS_LOC, type=convert_path)
    parser.add_argument('--datasets', help='index of dataset labels', type=convert_path)
    parser.add_argument('--modules', help='modules to load',
                        default=[], nargs='+', required=False)
    parser.add_argument('--mod_train_file', help='override the training set')
    parser.add_argument('--mod_valid_file', help='override the validation set')
    parser.add_argument('--mod_test_file', help='override the test set')
    parser.add_argument('--embeddings', help='index of embeddings', type=convert_path)
    parser.add_argument('--logging', help='config file for logging',
                        default=DEFAULT_LOGGING_LOC, type=convert_path)
    parser.add_argument('--task', help='task to run',
                        choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--gpus', help='Number of GPUs (defaults to number available)',
                        type=int, default=-1)
    parser.add_argument('--basedir',
                        help='Override the base directory where models are stored',
                        type=str)
    parser.add_argument('--reporting', help='reporting hooks', nargs='+')
    parser.add_argument('--backend', help='The deep learning backend to use')
    parser.add_argument('--checkpoint', help='Restart training from this checkpoint')
    # Leftover CLI args are forwarded as reporting args
    args, reporting_args = parser.parse_known_args()
    config_params = read_config_stream(args.config)
    if args.basedir is not None:
        config_params['basedir'] = args.basedir
    # CLI --task wins; otherwise fall back to the config, then 'classify'
    task_name = config_params.get('task', 'classify') if args.task is None else args.task
    args.logging = read_config_stream(args.logging)
    configure_logger(args.logging, config_params.get('basedir', './{}'.format(task_name)))
    try:
        args.settings = read_config_stream(args.settings)
    # NOTE(review): bare except treats *any* failure as "no settings file" — consider narrowing.
    except:
        logger.warning('Warning: no mead-settings file was found at [{}]'.format(args.settings))
        args.settings = {}
    # Resolution order: explicit CLI flag > settings entry > built-in default
    args.datasets = args.datasets if args.datasets else args.settings.get(
        'datasets', convert_path(DEFAULT_DATASETS_LOC))
    args.datasets = read_config_stream(args.datasets)
    if args.mod_train_file or args.mod_valid_file or args.mod_test_file:
        logging.warning('Warning: overriding the training/valid/test data with user-specified files'
                        ' different from what was specified in the dataset index. Creating a new key for this entry')
        update_datasets(args.datasets, config_params, args.mod_train_file,
                        args.mod_valid_file, args.mod_test_file)
    args.embeddings = args.embeddings if args.embeddings else args.settings.get(
        'embeddings', convert_path(DEFAULT_EMBEDDINGS_LOC))
    args.embeddings = read_config_stream(args.embeddings)
    # NOTE(review): default is -1, never None, so this condition is always true —
    # confirm whether the -1 sentinel is meant to reach the model config.
    if args.gpus is not None:
        config_params['model']['gpus'] = args.gpus
    # Backend resolution: CLI flag > settings > whatever the config already has
    if args.backend is None and 'backend' in args.settings:
        args.backend = args.settings['backend']
    if args.backend is not None:
        config_params['backend'] = normalize_backend(args.backend)
    # Merge CLI modules with config modules, de-duplicated
    config_params['modules'] = list(set(chain(config_params.get('modules', []), args.modules)))
    cmd_hooks = args.reporting if args.reporting is not None else []
    config_hooks = config_params.get('reporting') if config_params.get('reporting') is not None else []
    reporting = parse_extra_args(set(chain(cmd_hooks, config_hooks)), reporting_args)
    config_params['reporting'] = reporting
    logger.info('Task: [{}]'.format(task_name))
    task = mead.Task.get_task_specific(task_name, args.settings)
    task.read_config(config_params, args.datasets, reporting_args=reporting_args)
    task.initialize(args.embeddings)
    task.train(args.checkpoint)
# NOTE(review): this span is a fragment cut at a chunk boundary — the enclosing
# `def main():` header precedes it (not visible here) and the body of the final
# `if` statement follows it.  Code is left byte-identical; comments only.
# It mirrors the argument/vectorizer setup of the other embedding scripts in
# this file: resolve the embeddings/vecs indices, look up the vectorizer
# params by label, and normalize the transform key.
parser.add_argument('--max_len1d', type=int, default=100)
parser.add_argument('--embeddings',
                    help='index of embeddings: local file, remote URL or mead-ml/hub ref',
                    type=convert_path)
parser.add_argument('--vecs',
                    help='index of vectorizers: local file, remote URL or hub mead-ml/ref',
                    type=convert_path)
parser.add_argument('--cuda', type=baseline.str2bool, default=True)
parser.add_argument('--has_header', type=baseline.str2bool, default=True)
parser.add_argument('--sep', default='\t')
args = parser.parse_args()
# Resolution order: explicit CLI flag > built-in default
args.embeddings = convert_path(DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
args.embeddings = read_config_stream(args.embeddings)
args.vecs = convert_path(DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs
vecs_index = read_config_stream(args.vecs)
vecs_set = index_by_label(vecs_index)
vec_params = vecs_set[args.vec_id]
vec_params['mxlen'] = args.nctx
# The index may spell the transform key either way; normalize to 'transform_fn'
if 'transform' in vec_params:
    vec_params['transform_fn'] = vec_params['transform']
if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
def main():
    """CLI entry point: run the SentEval evaluation harness over a pretrained embedder.

    Loads a vectorizer and embedder by label, wires them into SentEval's
    ``prepare``/``batcher`` callbacks (with an optional pooling reduction),
    and evaluates the selected STS / classification / probing transfer tasks.
    """
    parser = argparse.ArgumentParser(description='Run senteval harness')
    parser.add_argument('--nctx', default=512, type=int)
    parser.add_argument("--module", default=None, help="Module containing custom tokenizers")
    parser.add_argument('--tasks', nargs="+", default=['sts', 'class', 'probe'])
    parser.add_argument('--batchsz', default=20, type=int)
    parser.add_argument('--tok',
                        help='Optional tokenizer, e.g. "gpt2" or "basic". These can be defined in extra module')
    parser.add_argument('--pool',
                        help='Should a reduction be applied on the embeddings? Only use if your embeddings arent already pooled',
                        type=str)
    parser.add_argument('--vec_id', help='Reference to a specific embedding type')
    parser.add_argument('--embed_id', help='What type of embeddings to use')
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument('--max_len1d', type=int, default=100)
    parser.add_argument('--embeddings',
                        help='index of embeddings: local file, remote URL or mead-ml/hub ref',
                        type=convert_path)
    parser.add_argument('--vecs',
                        help='index of vectorizers: local file, remote URL or hub mead-ml/ref',
                        type=convert_path)
    parser.add_argument('--fast', help="Run fast, but not necessarily as accurate",
                        action='store_true')
    parser.add_argument('--data', help="Path to senteval data",
                        default=os.path.expanduser("~/dev/work/SentEval/data"))
    args = parser.parse_args()
    if args.module:
        logger.warning("Loading custom user module %s for masking rules and tokenizers", args.module)
        baseline.import_user_module(args.module)
    # tokenizer stays None when --tok is absent; batcher then uses SentEval's pre-split tokens
    tokenizer = create_tokenizer(args.tok) if args.tok else None
    # Resolution order for each index: explicit CLI flag > built-in default
    args.embeddings = convert_path(DEFAULT_EMBEDDINGS_LOC) if args.embeddings is None else args.embeddings
    args.embeddings = read_config_stream(args.embeddings)
    args.vecs = convert_path(DEFAULT_VECTORIZERS_LOC) if args.vecs is None else args.vecs
    vecs_index = read_config_stream(args.vecs)
    vecs_set = index_by_label(vecs_index)
    vec_params = vecs_set[args.vec_id]
    vec_params['mxlen'] = args.nctx
    # The index may spell the transform key either way; normalize to 'transform_fn'
    if 'transform' in vec_params:
        vec_params['transform_fn'] = vec_params['transform']
    # NOTE(review): eval() on a string from the vectorizer index — only safe if the
    # index file is trusted; flagging rather than changing behavior.
    if 'transform_fn' in vec_params and isinstance(vec_params['transform_fn'], str):
        vec_params['transform_fn'] = eval(vec_params['transform_fn'])
    vectorizer = create_vectorizer(**vec_params)
    if not isinstance(vectorizer, HasPredefinedVocab):
        raise Exception("We currently require a vectorizer with a pre-defined vocab to run this script")
    embeddings_index = read_config_stream(args.embeddings)
    embeddings_set = index_by_label(embeddings_index)
    embeddings_params = embeddings_set[args.embed_id]
    embeddings = load_embeddings_overlay(embeddings_set, embeddings_params, vectorizer.vocab)
    embedder = embeddings['embeddings']
    embedder.to(args.device).eval()  # inference only — disable dropout etc.

    def _mean_pool(inputs, embeddings):
        # Mean over non-padded positions (token id 0 is treated as padding)
        mask = (inputs != 0)
        seq_lengths = mask.sum(1).unsqueeze(-1)
        return embeddings.sum(1)/seq_lengths

    def _zero_tok_pool(_, embeddings):
        # Take the first token's vector (e.g. a [CLS]-style token)
        pooled = embeddings[:, 0]
        return pooled

    def _max_pool(inputs, embeddings):
        # Max over time, with padded positions pushed to -1e8 so they never win
        mask = (inputs != 0)
        embeddings = embeddings.masked_fill(mask.unsqueeze(-1) == False, -1e8)
        return torch.max(embeddings, 1, False)[0]

    # Select the pooling reduction; any unrecognized --pool value means mean pooling,
    # and no --pool at all passes the embeddings through unreduced.
    if args.pool:
        if args.pool == 'max':
            pool = _max_pool
        elif args.pool == 'zero' or args.pool == 'cls':
            pool = _zero_tok_pool
        else:
            pool = _mean_pool
    else:
        pool = lambda x, y: y

    params_senteval = {'task_path': args.data, 'usepytorch': True, 'kfold': 10}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                     'tenacity': 5, 'epoch_size': 4}
    if args.fast:
        # Cheaper evaluation settings: fewer folds, shorter epochs, larger batches
        logging.info("Setting fast params")
        params_senteval['kfold'] = 5
        params_senteval['classifier']['epoch_size'] = 2
        params_senteval['classifier']['tenacity'] = 3
        params_senteval['classifier']['batch_size'] = 128

    # SentEval prepare and batcher
    def prepare(params, samples):
        # Size the vectorizer to the longest sample, capped at --nctx.
        # SUBWORD_EXTRA presumably pads for subword expansion — defined elsewhere; verify.
        max_sample = max(len(s) for s in samples)
        vectorizer.mxlen = min(args.nctx, max_sample + SUBWORD_EXTRA)
        logging.info('num_samples %d, mxlen set to %d', max_sample, vectorizer.mxlen)

    def batcher(params, batch):
        # SentEval hands over lists of tokens; empty sentences become ['.']
        if not tokenizer:
            batch = [sent if sent != [] else ['.'] for sent in batch]
        else:
            batch = [tokenizer(' '.join(sent)) for sent in batch]
        vs = []
        for sent in batch:
            v, l = vectorizer.run(sent, vectorizer.vocab)
            vs.append(v)
        vs = np.stack(vs)
        with torch.no_grad():
            inputs = torch.tensor(vs, device=args.device)
            encoding = embedder(inputs)
            encoding = pool(inputs, encoding)
            encoding = encoding.cpu().numpy()
        return encoding

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # Build the transfer-task list from the requested task groups
    transfer_tasks = []
    if 'sts' in args.tasks:
        transfer_tasks += ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
                           'SICKRelatedness', 'STSBenchmark']
    if 'class' in args.tasks:
        transfer_tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
                           'MRPC', 'SICKEntailment']
    if 'probe' in args.tasks:
        transfer_tasks += ['Length', 'WordContent', 'Depth', 'TopConstituents',
                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
                           'OddManOut', 'CoordinationInversion']
    results = se.eval(transfer_tasks)
    print(results)