from collections import defaultdict

# Assumed imports: Features, DictVectorizer, StandardScaler, and log_info are
# project-local helpers here (note that `binarize_numeric` is not an argument
# of scikit-learn's DictVectorizer); adjust the module paths to match the
# surrounding codebase.


class FeaturesPerceptronRanker(BasePerceptronRanker):
    """Base class for a global ranker for whole trees, based on features."""

    def __init__(self, cfg):
        super(FeaturesPerceptronRanker, self).__init__(cfg)
        if not cfg:
            cfg = {}
        self.feats = ['bias: bias']
        self.vectorizer = None
        self.normalizer = None
        self.binarize = cfg.get('binarize', False)
        # initialize feature functions
        if 'features' in cfg:
            self.feats.extend(cfg['features'])
        self.feats = Features(self.feats, cfg.get('intermediate_features', []))

    def _extract_feats(self, tree, da):
        # vectorize (and optionally normalize) features for one tree-DA pair
        feats = self.vectorizer.transform(
            [self.feats.get_features(tree, {'da': da})])
        if self.normalizer:
            feats = self.normalizer.transform(feats)
        return feats[0]

    def _init_training(self, das_file, ttree_file, data_portion):
        super(FeaturesPerceptronRanker, self)._init_training(
            das_file, ttree_file, data_portion)
        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+ train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(
                self.vectorizer.fit_transform(X))
        log_info('Features matrix shape: %s' % str(self.train_feats.shape))

    def _prune_features(self, X):
        """Prune features – remove all entries from X that involve features
        not reaching the minimum occurrence count given by self.prune_feats.
        """
        counts = defaultdict(int)
        for inst in X:
            for key in inst:
                counts[key] += 1
        for inst in X:
            # iterate over a copy of the keys so entries can be deleted safely
            for key in list(inst.keys()):
                if counts[key] < self.prune_feats:
                    del inst[key]
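# A standalone, runnable sketch of the pruning step in _prune_features above:
# features that occur in fewer than `min_count` training instances are dropped
# from every instance. The function name and toy data are illustrative, not
# part of the original code; self.prune_feats plays the role of `min_count`.
from collections import defaultdict


def prune_rare_features(X, min_count):
    counts = defaultdict(int)
    for inst in X:
        for key in inst:
            counts[key] += 1  # count how many instances each feature occurs in
    for inst in X:
        for key in list(inst.keys()):
            if counts[key] < min_count:
                del inst[key]  # drop features below the occurrence threshold


X = [{'bias': 1, 'rare_feat': 1}, {'bias': 1}, {'bias': 1}]
prune_rare_features(X, min_count=2)
assert X == [{'bias': 1}, {'bias': 1}, {'bias': 1}]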
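# Minimal sketch of the non-binarized branch of _init_training using plain
# scikit-learn, assuming standard DictVectorizer/StandardScaler semantics
# (the class above appears to rely on project-local wrappers, since
# `binarize_numeric` is not a scikit-learn DictVectorizer argument):
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

X = [{'bias': 1.0, 'depth': 3.0}, {'bias': 1.0, 'depth': 5.0}]
vectorizer = DictVectorizer(sparse=False)  # dict features -> dense matrix
normalizer = StandardScaler(copy=False)    # scale to zero mean, unit variance
train_feats = normalizer.fit_transform(vectorizer.fit_transform(X))
print(train_feats.shape)  # (2, 2): one row per instance, one column per feature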