def __init__(self, vectorizers, trim=False, truncate=False, mxlen=-1, **kwargs):
    """Store the feature vectorizers and set up the label vectorizer.

    :param vectorizers: Feature vectorizers, stored unchanged on ``self.vectorizers``
    :param trim: Stored unchanged on ``self.trim``
    :param truncate: Stored unchanged on ``self.truncate``
    :param mxlen: Passed as ``mxlen`` to the fallback ``Dict1DVectorizer`` that is
        created when no ``label_vectorizer`` spec is supplied
    :param kwargs: May contain ``label_vectorizer``, a ``dict`` of keyword
        arguments for ``create_vectorizer``.  Keys handled here before the call:

        * ``data_download_cache`` - download cache location, defaults to ``~/.bl-data``
        * ``model_file`` / ``vocab_file`` - resolved through ``SingleFileDownloader``
        * ``transform`` - copied to ``transform_fn``
        * ``transform_fn`` - when it is a ``str`` it is evaluated with ``eval``
    """
    super().__init__()
    self.vectorizers = vectorizers
    self.trim = trim
    self.truncate = truncate

    label_vectorizer_spec = kwargs.get('label_vectorizer', None)
    if label_vectorizer_spec:
        # Work on a shallow copy so the caller's configuration is not rewritten
        # with downloaded paths and an evaluated callable.
        label_vectorizer_spec = dict(label_vectorizer_spec)
        cache = label_vectorizer_spec.get("data_download_cache", os.path.expanduser("~/.bl-data"))
        for file_key in ('model_file', 'vocab_file'):
            if file_key in label_vectorizer_spec:
                label_vectorizer_spec[file_key] = SingleFileDownloader(
                    label_vectorizer_spec[file_key], cache).download()
        # `transform` takes precedence over any `transform_fn` already present
        if 'transform' in label_vectorizer_spec:
            label_vectorizer_spec['transform_fn'] = label_vectorizer_spec['transform']
        if isinstance(label_vectorizer_spec.get('transform_fn'), str):
            # SECURITY NOTE(review): eval() executes arbitrary code taken from
            # the configuration; only use configurations from trusted sources.
            label_vectorizer_spec['transform_fn'] = eval(label_vectorizer_spec['transform_fn'])
        self.label_vectorizer = create_vectorizer(**label_vectorizer_spec)
    else:
        self.label_vectorizer = Dict1DVectorizer(fields='y', mxlen=mxlen)

    # Seed the label vocabulary with the reserved PAD, GO and EOS entries
    self.label2index = {
        Offsets.VALUES[Offsets.PAD]: Offsets.PAD,
        Offsets.VALUES[Offsets.GO]: Offsets.GO,
        Offsets.VALUES[Offsets.EOS]: Offsets.EOS
    }
def _create_vectorizers(self, vecs_set=None):
    """Read the `features` section of the mead config and build the vectorizers.

    This section contains both embedding info and vectorizers.  The
    `vectorizer` sub-section of each feature is used to instantiate a
    vectorizer, which is stored in ``self.vectorizers`` keyed off of the
    `features->name`.  ``self.primary_key`` is set to the name of the first
    feature, or of the last feature flagged with ``primary: true``.

    :param vecs_set: (``dict``) Optional shared vectorizer configurations.  When
        a feature's vectorizer section contains a `label`, the entry stored
        under that label supplies defaults which the feature's own settings
        override
    :raises ValueError: If a feature name contains "-", or if a vectorizer
        `label` cannot be found in `vecs_set`
    :return: (``dict``) - A dictionary of the vectorizers keyed by feature name
    """
    self.vectorizers = {}
    features = self.config_params['features']
    assert_unique_feature_names([f['name'] for f in features])
    self.primary_key = features[0]['name']
    # Global length limits used when a vectorizer section does not set its own
    preproc = self.config_params.get('preproc', {})
    for feature in features:
        key = feature['name']
        if '-' in key:
            raise ValueError('Feature names cannot contain "-". Found feature named "{}"'.format(key))
        if feature.get('primary', False) is True:
            self.primary_key = key
        vectorizer_section = feature.get('vectorizer', {})
        vecs_global_config = {'type': 'token1d'}
        if 'label' in vectorizer_section:
            # `vecs_set` may be omitted; report a missing label explicitly
            # rather than failing on ``None.get`` or ``{**None}``.
            vecs_global_config = (vecs_set or {}).get(vectorizer_section['label'])
            if vecs_global_config is None:
                raise ValueError(
                    'Feature "{}" references vectorizer label "{}" which is not defined'.format(
                        key, vectorizer_section['label']))
        # Merge into a new dict so the config itself is left untouched;
        # feature-level settings win over the shared ones.
        vectorizer_section = {**vecs_global_config, **vectorizer_section}
        vectorizer_section['data_download_cache'] = self.data_download_cache
        vec_file = vectorizer_section.get('file')
        if vec_file:
            vec_file = SingleFileDownloader(vec_file, self.data_download_cache).download()
            vectorizer_section['file'] = vec_file
        vectorizer_section['mxlen'] = vectorizer_section.get('mxlen', preproc.get('mxlen', -1))
        vectorizer_section['mxwlen'] = vectorizer_section.get('mxwlen', preproc.get('mxwlen', -1))
        for file_key in ('model_file', 'vocab_file'):
            if file_key in vectorizer_section:
                vectorizer_section[file_key] = SingleFileDownloader(
                    vectorizer_section[file_key], self.data_download_cache).download()
        # `transform` takes precedence over any `transform_fn` already present
        if 'transform' in vectorizer_section:
            vectorizer_section['transform_fn'] = vectorizer_section['transform']
        if isinstance(vectorizer_section.get('transform_fn'), str):
            # SECURITY NOTE(review): eval() executes arbitrary code taken from
            # the configuration; only use configurations from trusted sources.
            vectorizer_section['transform_fn'] = eval(vectorizer_section['transform_fn'])
        vectorizer = baseline.create_vectorizer(**vectorizer_section)
        self.vectorizers[key] = vectorizer
    return self.vectorizers
def __init__(self, vectorizers, trim=False, truncate=False, mxlen=-1, **kwargs):
    """Store the feature vectorizers and set up one label vectorizer per label.

    :param vectorizers: Feature vectorizers, stored unchanged on ``self.vectorizers``
    :param trim: Stored unchanged on ``self.trim``
    :param truncate: Stored unchanged on ``self.truncate``
    :param mxlen: Passed as ``mxlen`` to the default ``Dict1DVectorizer`` used
        for the label ``y`` when no ``label_vectorizers`` are supplied
    :param kwargs: May contain ``label_vectorizers``, a ``dict`` keyed by label
        name.  Each value is either a ``dict`` of keyword arguments for
        ``create_vectorizer`` or an already constructed vectorizer, which is
        used as-is.  In a spec ``dict``, ``label`` defaults to the label name,
        ``data_download_cache`` defaults to ``~/.bl-data`` and ``model_file`` /
        ``vocab_file`` are resolved through ``SingleFileDownloader``
    """
    from collections.abc import Mapping

    super().__init__()
    self.vectorizers = vectorizers
    self.trim = trim
    self.truncate = truncate

    label_vectorizer_spec_dict = kwargs.get('label_vectorizers')
    if label_vectorizer_spec_dict is None:
        label_vectorizer_spec_dict = {'y': Dict1DVectorizer(fields='y', mxlen=mxlen)}

    self.label_vectorizers = {}
    self.label2index = {}
    for k, label_vectorizer_spec in label_vectorizer_spec_dict.items():
        if isinstance(label_vectorizer_spec, Mapping):
            # Work on a shallow copy so the caller's configuration is not
            # rewritten with the injected label and the downloaded paths.
            label_vectorizer_spec = dict(label_vectorizer_spec)
            label_vectorizer_spec.setdefault('label', k)
            cache = label_vectorizer_spec.get("data_download_cache", os.path.expanduser("~/.bl-data"))
            for file_key in ('model_file', 'vocab_file'):
                if file_key in label_vectorizer_spec:
                    label_vectorizer_spec[file_key] = SingleFileDownloader(
                        label_vectorizer_spec[file_key], cache).download()
            label_vectorizer = create_vectorizer(**label_vectorizer_spec)
        else:
            # Already a vectorizer (e.g. the default above); it cannot be
            # unpacked as keyword arguments, so use it directly.
            label_vectorizer = label_vectorizer_spec
        self.label_vectorizers[k] = label_vectorizer
        # Each label gets its own vocabulary seeded with PAD, GO and EOS
        self.label2index[k] = {
            Offsets.VALUES[Offsets.PAD]: Offsets.PAD,
            Offsets.VALUES[Offsets.GO]: Offsets.GO,
            Offsets.VALUES[Offsets.EOS]: Offsets.EOS
        }