class LanguageModelingTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()
        reader_params = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        reader_params['nctx'] = reader_params.get('nctx', self.config_params.get('nctx', self.config_params.get('nbptt', 35)))
        reader_params['clean_fn'] = reader_params.get('clean_fn', self.config_params.get('preproc', {}).get('clean_fn'))
        if reader_params['clean_fn'] is not None and self.config_params['dataset'] != 'SST2':
            logger.warning('A reader preprocessing function (%s) is active; it is recommended that all data '
                           'preprocessing be done outside of baseline to ensure that the data seen at inference '
                           'time matches the data seen at training time.', reader_params['clean_fn'])
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(self.task_name(), self.vectorizers,
                                             self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'pytorch':
            self.config_params.get('preproc', {})['trim'] = True
        elif backend.name == 'dy':
            self.config_params.get('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocabs = self.reader.build_vocab(vocab_sources,
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        tgt_key = read.get('tgt_key', self.primary_key)
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index, bsz, tgt_key=tgt_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index, vbsz, tgt_key=tgt_key)
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index, 1, tgt_key=tgt_key)

    def _create_model(self):
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params.get('reader', self.config_params.get('loader', {})).get('tgt_key', self.primary_key)
        model['src_keys'] = listify(self.config_params.get('reader', self.config_params.get('loader', {})).get('src_keys', list(self.embeddings.keys())))
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self, checkpoint=None):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type', None) == 'zaremba':
            first_range = int(self.config_params['train']['start_decay_epoch'] * self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(self.config_params['train']['start_decay_epoch'] + 1,
                          self.config_params['train']['epochs'] + 1,
                          dtype=np.int32) * self.train_data.steps)
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
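
# Illustrative sketch (not part of tasks.py): the 'zaremba' learning-rate schedule above converts the
# epoch at which decay starts into step-based boundaries. The concrete numbers below are made up.
import numpy as np

def zaremba_bounds(start_decay_epoch, epochs, steps_per_epoch):
    """Mirror the bounds computation in LanguageModelingTask.train()."""
    first = int(start_decay_epoch * steps_per_epoch)
    later = np.arange(start_decay_epoch + 1, epochs + 1, dtype=np.int32) * steps_per_epoch
    return [first] + list(later)

# e.g. start decaying at epoch 6 of 13, with 1327 steps per epoch
print(zaremba_bounds(6, 13, 1327))  # [7962, 9289, 10616, 11943, 13270, 14597, 15924, 17251]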


class TaggerTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(TaggerTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'tagger'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
            else:
                raise Exception('Tagger currently only supports autobatching. '
                                'Change "batchsz" to 1 and, under "train", set "autobatchsz" to your desired batchsz.')
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': False}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = False
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocabs = self.reader.build_vocab(vocab_sources,
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        labels = self.reader.label2index
        span_type = self.config_params['train'].get('span_type')
        constrain = bool(self.config_params['model'].get('constrain_decode', False))
        if span_type is None and constrain:
            logger.warning('Constrained decoding was requested but no span type could be found, so no constraints will be applied.')
        self.config_params['model']['span_type'] = span_type
        if span_type is not None and constrain:
            self.config_params['model']['constraint'] = self.backend.transition_mask(
                labels, span_type, Offsets.GO, Offsets.EOS, Offsets.PAD)
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_tagger_model(self.embeddings, labels, **self.config_params['model'])

    def _load_dataset(self):
        # TODO: get rid of sort_key=self.primary_key in favor of something explicit?
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data, _ = self.reader.load(self.dataset['train_file'], self.feat2index, bsz, shuffle=True,
                                              sort_key='{}_lengths'.format(self.primary_key))
        self.valid_data, _ = self.reader.load(self.dataset['valid_file'], self.feat2index, vbsz, sort_key=None)
        self.test_data = None
        self.txts = None
        if 'test_file' in self.dataset:
            self.test_data, self.txts = self.reader.load(self.dataset['test_file'], self.feat2index, tbsz,
                                                         shuffle=False, sort_key=None)

    def train(self, checkpoint=None):
        self._load_dataset()
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        conll_output = self.config_params.get("conll_output", None)
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data,
                                     conll_output=conll_output, txts=self.txts, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics
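
# Illustrative sketch (not part of tasks.py): _create_model above normalizes whatever lengths key the
# config supplies so it always carries the '_lengths' suffix. The helper name below is made up.
def normalize_lengths_key(lengths_key):
    if lengths_key is not None and not lengths_key.endswith('_lengths'):
        lengths_key = '{}_lengths'.format(lengths_key)
    return lengths_key

assert normalize_lengths_key('word') == 'word_lengths'
assert normalize_lengths_key('word_lengths') == 'word_lengths'
assert normalize_lengths_key(None) is None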


class EncoderDecoderTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'seq2seq'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        self.config_params['preproc']['show_ex'] = show_examples
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = True
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocab1, vocab2 = self.reader.build_vocabs(vocab_sources,
                                                  min_f=Task._get_min_f(self.config_params),
                                                  vocab_file=self.dataset.get('vocab_file'))
        # To keep the config file simple, the source and target (tgt) features share a single list
        features_src = []
        features_tgt = None
        for feature in self.config_params['features']:
            if feature['name'] == 'tgt':
                features_tgt = feature
            else:
                features_src += [feature]
        self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
        # For now, don't allow multiple output vocabularies
        baseline.save_vocabs(self.get_basedir(), self.feat2src)
        self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
        baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
        self.tgt_embeddings = self.tgt_embeddings['tgt']
        self.feat2tgt = self.feat2tgt['tgt']

    def _load_dataset(self):
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2src, self.feat2tgt, bsz,
                                           shuffle=True, sort_key='{}_lengths'.format(self.primary_key))
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2src, self.feat2tgt, vbsz, shuffle=True)
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(self.dataset['test_file'], self.feat2src, self.feat2tgt, tbsz)

    def _create_model(self):
        self.config_params['model']['unif'] = self.config_params['unif']
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('src_lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['src_lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_seq2seq_model(self.src_embeddings, self.tgt_embeddings, **self.config_params['model'])

    def train(self, checkpoint=None):
        num_ex = self.config_params['num_valid_to_show']
        rlut1 = revlut(self.feat2src[self.primary_key])
        rlut2 = revlut(self.feat2tgt)
        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(
                model, self.valid_data, rlut1, rlut2, self.feat2tgt,
                preproc['mxlen'], False, 0, num_ex, reverse=False)
            self.config_params['train']['tgt_rlut'] = rlut2
        return super(EncoderDecoderTask, self).train(checkpoint)
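
# Illustrative sketch (not part of tasks.py): initialize() lets source and target features share one
# 'features' list in the config and splits them on the reserved name 'tgt'. The feature entries below
# are made up; only the 'name' key matters for the split.
features = [
    {'name': 'word', 'embeddings': {'label': 'glove-42B'}},
    {'name': 'tgt', 'embeddings': {'label': 'glove-42B'}},
]
features_src, features_tgt = [], None
for feature in features:
    if feature['name'] == 'tgt':
        features_tgt = feature
    else:
        features_src.append(feature)
print([f['name'] for f in features_src], features_tgt['name'])  # ['word'] tgt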


class EncoderDecoderTask(Task):

    def __init__(self, logging_file, mead_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(logging_file, mead_config, **kwargs)
        self.task = None

    def _create_task_specific_reader(self):
        preproc = self.config_params['preproc']
        reader = baseline.create_parallel_corpus_reader(preproc['mxlen'], preproc['vec_alloc'],
                                                        preproc['trim'], preproc['word_trans_fn'],
                                                        **self.config_params['loader'])
        return reader

    def _setup_task(self):
        # If it's not vanilla seq2seq, don't bother reversing
        do_reverse = self.config_params['model']['model_type'] == 'default'
        backend = self.config_params.get('backend', 'tensorflow')
        if backend == 'pytorch':
            print('PyTorch backend')
            from baseline.pytorch import long_0_tensor_alloc as vec_alloc
            from baseline.pytorch import tensor_shape as vec_shape
            from baseline.pytorch import tensor_reverse_2nd as rev2nd
            import baseline.pytorch.seq2seq as seq2seq
            self.config_params['preproc']['vec_alloc'] = vec_alloc
            self.config_params['preproc']['vec_shape'] = vec_shape
            src_vec_trans = rev2nd if do_reverse else None
            self.config_params['preproc']['word_trans_fn'] = src_vec_trans
            self.config_params['preproc']['show_ex'] = baseline.pytorch.show_examples_pytorch
            self.config_params['preproc']['trim'] = True
        else:
            import baseline.tf.seq2seq as seq2seq
            import mead.tf
            self.ExporterType = mead.tf.Seq2SeqTensorFlowExporter
            self.config_params['preproc']['vec_alloc'] = np.zeros
            self.config_params['preproc']['vec_shape'] = np.shape
            self.config_params['preproc']['trim'] = False
            src_vec_trans = baseline.reverse_2nd if do_reverse else None
            self.config_params['preproc']['word_trans_fn'] = src_vec_trans
            self.config_params['preproc']['show_ex'] = baseline.tf.show_examples_tf
        self.task = seq2seq

    def initialize(self, embeddings):
        embeddings_set = mead.utils.index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache, True).download()
        print("[train file]: {}\n[valid file]: {}\n[test file]: {}\n[vocab file]: {}".format(
            self.dataset['train_file'], self.dataset['valid_file'],
            self.dataset['test_file'], self.dataset.get('vocab_file', "None")))
        vocab_file = self.dataset.get('vocab_file', None)
        if vocab_file is not None:
            vocab1, vocab2 = self.reader.build_vocabs([vocab_file])
        else:
            vocab1, vocab2 = self.reader.build_vocabs([self.dataset['train_file'],
                                                       self.dataset['valid_file'],
                                                       self.dataset['test_file']])
        self.embeddings1, self.feat2index1 = self._create_embeddings(embeddings_set, {'word': vocab1})
        self.embeddings2, self.feat2index2 = self._create_embeddings(embeddings_set, {'word': vocab2})

    def _load_dataset(self):
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index1['word'],
                                           self.feat2index2['word'], self.config_params['batchsz'], shuffle=True)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index1['word'],
                                           self.feat2index2['word'], self.config_params['batchsz'], shuffle=True)
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index1['word'],
                                          self.feat2index2['word'], self.config_params.get('test_batchsz', 1))

    def _create_model(self):
        return self.task.create_model(self.embeddings1['word'], self.embeddings2['word'], **self.config_params['model'])

    def train(self):
        num_ex = self.config_params['num_valid_to_show']
        if num_ex > 0:
            print('Showing examples')
            preproc = self.config_params['preproc']
            show_ex_fn = preproc['show_ex']
            rlut1 = baseline.revlut(self.feat2index1['word'])
            rlut2 = baseline.revlut(self.feat2index2['word'])
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(
                model, self.valid_data, rlut1, rlut2, self.embeddings2['word'],
                preproc['mxlen'], False, 0, num_ex, reverse=False)
        super(EncoderDecoderTask, self).train()
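
# Illustrative sketch (not part of tasks.py): train() above builds reverse lookup tables with
# baseline.revlut so predicted indices can be printed back as tokens. The function below is an
# assumed minimal equivalent, not the library implementation.
def revlut_sketch(vocab):
    """Invert a token->index vocabulary into an index->token lookup."""
    return {index: token for token, index in vocab.items()}

vocab = {'<PAD>': 0, '<GO>': 1, '<EOS>': 2, 'hello': 3, 'world': 4}
rlut = revlut_sketch(vocab)
print(rlut[3], rlut[4])  # hello world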


class ClassifierTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        backend.load(self.task_name())
        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.get('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            logger.info('Clean')
        else:
            self.config_params.setdefault('preproc', {})
            self.config_params['preproc']['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
        # TODO: make this optional
        if 'test_file' in self.dataset:
            vocab_sources.append(self.dataset['test_file'])
        vocab, self.labels = self.reader.build_vocab(vocab_sources,
                                                     min_f=Task._get_min_f(self.config_params),
                                                     vocab_file=self.dataset.get('vocab_file'),
                                                     label_file=self.dataset.get('label_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels, **model)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        sort_key = read.get('sort_key')
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index, bsz,
                                           shuffle=True, sort_key=sort_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index, vbsz)
        self.test_data = None
        if 'test_file' in self.dataset:
            self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index, tbsz)
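
# Illustrative sketch (not part of tasks.py): the newer tasks unpack three batch sizes from
# Task._get_batchsz, which is not shown in this section. Based on how the older task versions below
# read 'batchsz', 'valid_batchsz' and 'test_batchsz', a plausible equivalent (an assumption, not the
# real helper) is:
def get_batchsz_sketch(config_params):
    bsz = config_params['batchsz']
    vbsz = config_params.get('valid_batchsz', bsz)
    tbsz = config_params.get('test_batchsz', 1)
    return bsz, vbsz, tbsz

print(get_batchsz_sketch({'batchsz': 20}))                       # (20, 20, 1)
print(get_batchsz_sketch({'batchsz': 20, 'valid_batchsz': 32}))  # (20, 32, 1)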


class LanguageModelingTask(Task):

    def __init__(self, logging_config, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(logging_config, mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()
        reader_params = self.config_params['loader']
        reader_params['nctx'] = reader_params.get('nctx', self.config_params.get('nctx', self.config_params.get('nbptt', 35)))
        reader_params['clean_fn'] = reader_params.get('clean_fn', self.config_params.get('preproc', {}).get('clean_fn'))
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(self.task_name(), self.vectorizers,
                                             self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'pytorch':
            self.config_params.get('preproc', {})['trim'] = True
        elif backend.name == 'dy':
            self.config_params.get('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocabs = self.reader.build_vocab([self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        tgt_key = self.config_params['loader'].get('tgt_key', self.primary_key)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index,
                                           self.config_params['batchsz'], tgt_key=tgt_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index,
                                           self.config_params.get('valid_batchsz', self.config_params['batchsz']),
                                           tgt_key=tgt_key)
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index, 1, tgt_key=tgt_key)

    def _create_model(self):
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params['loader'].get('tgt_key', self.primary_key)
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type', None) == 'zaremba':
            first_range = int(self.config_params['train']['start_decay_epoch'] * self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(self.config_params['train']['start_decay_epoch'] + 1,
                          self.config_params['train']['epochs'] + 1,
                          dtype=np.int32) * self.train_data.steps)
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        baseline.train.fit(model, self.train_data, self.valid_data, self.test_data, **self.config_params['train'])
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
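
# Illustrative sketch (not part of tasks.py): _num_steps_per_epoch chops the token count into batch
# columns and then into backprop-through-time windows. The numbers below are made up.
def num_steps_per_epoch_sketch(num_examples, nctx, batchsz):
    rest = num_examples // batchsz
    return rest // nctx

# e.g. ~930k tokens, nctx (nbptt) of 35, batch size 20 -> 1327 steps per epoch
print(num_steps_per_epoch_sketch(929589, 35, 20))  # 1327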


class ClassifierTask(Task):

    def __init__(self, logging_config, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(logging_config, mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        elif backend.name == 'tf':
            # FIXME this should be registered as well!
            exporter_type = kwargs.get('exporter_type', 'default')
            if exporter_type == 'default':
                from mead.tf.exporters import ClassifyTensorFlowExporter
                backend.exporter = ClassifyTensorFlowExporter
            elif exporter_type == 'preproc':
                from mead.tf.preproc_exporters import ClassifyTensorFlowPreProcExporter
                import mead.tf.preprocessors
                backend.exporter = ClassifyTensorFlowPreProcExporter
        backend.load(self.task_name())
        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.get('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            print('Clean')
        else:
            self.config_params['preproc'] = {}
            self.config_params['preproc']['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab, self.labels = self.reader.build_vocab([self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
                                                     min_f=Task._get_min_f(self.config_params),
                                                     vocab_file=self.dataset.get('vocab_file'),
                                                     label_file=self.dataset.get('label_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels, **model)

    def _load_dataset(self):
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index,
                                           self.config_params['batchsz'], shuffle=True,
                                           sort_key=self.config_params['loader'].get('sort_key'))
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index,
                                           self.config_params.get('valid_batchsz', self.config_params['batchsz']))
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index,
                                          self.config_params.get('test_batchsz', 1))
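
# Illustrative sketch (not part of tasks.py): initialize() indexes the embeddings registry with
# index_by_label before looking entries up per feature. The helper is not shown in this section;
# the 'label' key and the entries below are assumptions made for illustration.
def index_by_label_sketch(embeddings_config):
    return {entry['label']: entry for entry in embeddings_config}

embeddings_config = [
    {'label': 'glove-6B-100', 'file': '/data/embeddings/glove.6B.100d.txt', 'dsz': 100},
    {'label': 'glove-42B', 'file': '/data/embeddings/glove.42B.300d.txt', 'dsz': 300},
]
embeddings_set = index_by_label_sketch(embeddings_config)
print(embeddings_set['glove-6B-100']['dsz'])  # 100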


class LanguageModelingTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(LanguageModelingTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'lm'

    def _create_task_specific_reader(self):
        self._create_vectorizers()
        reader_params = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        reader_params['nctx'] = reader_params.get('nctx', self.config_params.get('nctx', self.config_params.get('nbptt', 35)))
        reader_params['clean_fn'] = reader_params.get('clean_fn', self.config_params.get('preproc', {}).get('clean_fn'))
        if reader_params['clean_fn'] is not None and self.config_params['dataset'] != 'SST2':
            logger.warning('A reader preprocessing function (%s) is active; it is recommended that all data '
                           'preprocessing be done outside of baseline to ensure that the data seen at inference '
                           'time matches the data seen at training time.', reader_params['clean_fn'])
        reader_params['mxlen'] = self.vectorizers[self.primary_key].mxlen
        if self.config_params['model'].get('gpus', 1) > 1:
            reader_params['truncate'] = True
        return baseline.reader.create_reader(self.task_name(), self.vectorizers,
                                             self.config_params['preproc'].get('trim', False), **reader_params)

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'pytorch':
            self.config_params.get('preproc', {})['trim'] = True
        elif backend.name == 'dy':
            self.config_params.get('preproc', {})['trim'] = True
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocabs = self.reader.build_vocab([self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
                                         min_f=Task._get_min_f(self.config_params),
                                         vocab_file=self.dataset.get('vocab_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        tgt_key = read.get('tgt_key', self.primary_key)
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index, bsz, tgt_key=tgt_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index, vbsz, tgt_key=tgt_key)
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index, 1, tgt_key=tgt_key)

    def _create_model(self):
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        model['batchsz'] = self.config_params['batchsz']
        model['tgt_key'] = self.config_params.get('reader', self.config_params.get('loader', {})).get('tgt_key', self.primary_key)
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_lang_model(self.embeddings, **model)

    def train(self, checkpoint=None):
        self._load_dataset()
        if self.config_params['train'].get('lr_scheduler_type', None) == 'zaremba':
            first_range = int(self.config_params['train']['start_decay_epoch'] * self.train_data.steps)
            self.config_params['train']['bounds'] = [first_range] + list(
                np.arange(self.config_params['train']['start_decay_epoch'] + 1,
                          self.config_params['train']['epochs'] + 1,
                          dtype=np.int32) * self.train_data.steps)
        baseline.save_vectorizers(self.get_basedir(), self.vectorizers)
        model = self._create_model()
        train_params = self.config_params['train']
        train_params['checkpoint'] = checkpoint
        metrics = baseline.train.fit(model, self.train_data, self.valid_data, self.test_data, **train_params)
        baseline.zip_files(self.get_basedir())
        self._close_reporting_hooks()
        return model, metrics

    @staticmethod
    def _num_steps_per_epoch(num_examples, nctx, batchsz):
        rest = num_examples // batchsz
        return rest // nctx
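
# Illustrative sketch (not part of tasks.py): several methods above accept either the newer 'reader'
# config section or the legacy 'loader' section. The helper name below is made up.
def reader_section_sketch(config_params):
    return config_params['reader'] if 'reader' in config_params else config_params['loader']

print(reader_section_sketch({'reader': {'tgt_key': 'word'}}))  # {'tgt_key': 'word'}
print(reader_section_sketch({'loader': {'tgt_key': 'word'}}))  # {'tgt_key': 'word'}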


class EncoderDecoderTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(EncoderDecoderTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'seq2seq'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if 'preproc' not in self.config_params:
            self.config_params['preproc'] = {}
        self.config_params['preproc']['show_ex'] = show_examples
        if backend.name == 'pytorch':
            self.config_params['preproc']['trim'] = True
        elif backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
            self.config_params['preproc']['trim'] = True
        else:
            self.config_params['preproc']['trim'] = True
        backend.load(self.task_name())
        return backend

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab1, vocab2 = self.reader.build_vocabs([self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
                                                  min_f=Task._get_min_f(self.config_params),
                                                  vocab_file=self.dataset.get('vocab_file'))
        # To keep the config file simple, the source and target (tgt) features share a single list
        features_src = []
        features_tgt = None
        for feature in self.config_params['features']:
            if feature['name'] == 'tgt':
                features_tgt = feature
            else:
                features_src += [feature]
        self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
        # For now, don't allow multiple output vocabularies
        baseline.save_vocabs(self.get_basedir(), self.feat2src)
        self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
        baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
        self.tgt_embeddings = self.tgt_embeddings['tgt']
        self.feat2tgt = self.feat2tgt['tgt']

    def _load_dataset(self):
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2src, self.feat2tgt, bsz,
                                           shuffle=True, sort_key='{}_lengths'.format(self.primary_key))
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2src, self.feat2tgt, vbsz, shuffle=True)
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2src, self.feat2tgt, tbsz)

    def _create_model(self):
        self.config_params['model']['unif'] = self.config_params['unif']
        model = self.config_params['model']
        unif = self.config_params.get('unif', 0.1)
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('src_lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['src_lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_seq2seq_model(self.src_embeddings, self.tgt_embeddings, **self.config_params['model'])

    def train(self, checkpoint=None):
        num_ex = self.config_params['num_valid_to_show']
        rlut1 = revlut(self.feat2src[self.primary_key])
        rlut2 = revlut(self.feat2tgt)
        if num_ex > 0:
            logger.info('Showing examples')
            preproc = self.config_params.get('preproc', {})
            show_ex_fn = preproc['show_ex']
            self.config_params['train']['after_train_fn'] = lambda model: show_ex_fn(
                model, self.valid_data, rlut1, rlut2, self.feat2tgt,
                preproc['mxlen'], False, 0, num_ex, reverse=False)
            self.config_params['train']['tgt_rlut'] = rlut2
        return super(EncoderDecoderTask, self).train(checkpoint)
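
# Illustrative sketch (not part of tasks.py): train() above registers the example-display function as
# an 'after_train_fn' callback inside the train parameters, capturing the reverse lookup tables in a
# closure. The stand-in function and manual call below are made up; the real hook is invoked by
# baseline.train.fit.
def show_examples_sketch(model, rlut_src, rlut_tgt, num_ex):
    print('would decode {} example(s) against {} target tokens'.format(num_ex, len(rlut_tgt)))

train_params = {}
rlut_src, rlut_tgt, num_ex = {0: 'hello'}, {0: 'bonjour'}, 2
train_params['after_train_fn'] = lambda model: show_examples_sketch(model, rlut_src, rlut_tgt, num_ex)
train_params['after_train_fn'](model=None)  # a trainer honouring the hook would call it after each epoch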


class ClassifierTask(Task):

    def __init__(self, mead_settings_config, **kwargs):
        super(ClassifierTask, self).__init__(mead_settings_config, **kwargs)

    @classmethod
    def task_name(cls):
        return 'classify'

    def _create_backend(self, **kwargs):
        backend = Backend(self.config_params.get('backend', 'tf'))
        if backend.name == 'dy':
            import _dynet
            dy_params = _dynet.DynetParams()
            dy_params.from_args()
            dy_params.set_requested_gpus(1)
            if 'autobatchsz' in self.config_params['train']:
                self.config_params['train']['trainer_type'] = 'autobatch'
                dy_params.set_autobatch(True)
                batched = False
            else:
                batched = True
            dy_params.init()
            backend.params = {'pc': _dynet.ParameterCollection(), 'batched': batched}
        backend.load(self.task_name())
        return backend

    def _setup_task(self, **kwargs):
        super(ClassifierTask, self)._setup_task(**kwargs)
        if self.config_params.get('preproc', {}).get('clean', False) is True:
            self.config_params.get('preproc', {})['clean_fn'] = baseline.TSVSeqLabelReader.do_clean
            logger.info('Clean')
        else:
            self.config_params['preproc'] = {}
            self.config_params['preproc']['clean_fn'] = None

    def initialize(self, embeddings):
        embeddings = read_config_file_or_json(embeddings, 'embeddings')
        embeddings_set = index_by_label(embeddings)
        self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
        print_dataset_info(self.dataset)
        vocab, self.labels = self.reader.build_vocab([self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
                                                     min_f=Task._get_min_f(self.config_params),
                                                     vocab_file=self.dataset.get('vocab_file'),
                                                     label_file=self.dataset.get('label_file'))
        self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab, self.config_params['features'])
        baseline.save_vocabs(self.get_basedir(), self.feat2index)

    def _create_model(self):
        unif = self.config_params.get('unif', 0.1)
        model = self.config_params['model']
        model['unif'] = model.get('unif', unif)
        lengths_key = model.get('lengths_key', self.primary_key)
        if lengths_key is not None:
            if not lengths_key.endswith('_lengths'):
                lengths_key = '{}_lengths'.format(lengths_key)
            model['lengths_key'] = lengths_key
        if self.backend.params is not None:
            for k, v in self.backend.params.items():
                model[k] = v
        return baseline.model.create_model(self.embeddings, self.labels, **model)

    def _load_dataset(self):
        read = self.config_params['reader'] if 'reader' in self.config_params else self.config_params['loader']
        sort_key = read.get('sort_key')
        bsz, vbsz, tbsz = Task._get_batchsz(self.config_params)
        self.train_data = self.reader.load(self.dataset['train_file'], self.feat2index, bsz,
                                           shuffle=True, sort_key=sort_key)
        self.valid_data = self.reader.load(self.dataset['valid_file'], self.feat2index, vbsz)
        self.test_data = self.reader.load(self.dataset['test_file'], self.feat2index, tbsz)
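
# Illustrative sketch (not part of tasks.py): taken together, these tasks read a handful of top-level
# config sections. The section names below all appear in the code above; the values and the inner
# structure of the 'features' entry are placeholders, not from the source.
config_params = {
    'backend': 'tf',
    'dataset': 'SST2',
    'batchsz': 50,
    'unif': 0.25,
    'features': [{'name': 'word', 'embeddings': {'label': 'glove-42B'}}],
    'loader': {'sort_key': None},          # newer configs may use 'reader' instead
    'model': {'model_type': 'default'},
    'train': {'epochs': 2, 'lr_scheduler_type': None},
}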