def _build_vectorizer(init_args, conf, pipeline_list):
    """
    Appends a vectorizer step (named 'vect') to *pipeline_list* and merges
    the vectorizer's applicable settings into *init_args*.

    The vectorizer class is named by the *vectorizer* field in the main
    configuration file and must implement *fit*, *transform* and
    *fit_transform* as per scikit-learn. Keyword arguments accepted by the
    class's *__init__* are collected from the *feature extraction* (and
    related) sections of the configuration; non-matching entries are simply
    ignored. The vectorizer converts a raw-text corpus into a term frequency
    matrix, returned as a numpy *coo_matrix*.

    Note this function modifies (updates / appends to) its input arguments.
    """
    vectorizer = get_named_object(conf['vectorizer']['class'])

    # The vectorizer object must take keyword arguments only; keep the subset
    # of configuration entries whose names match its __init__ parameters.
    # todo KmeansVectorizer does not declare its parameters explicitly so
    # intersection doesnt work; instead its constructor should take **kwargs,
    # and we can pass in whatever we want with no need to manually check
    # which parameters are valid for that object
    for section in (conf['feature_extraction'], conf['vectorizer'], conf):
        init_args.update(get_intersection_of_parameters(vectorizer, section, 'vect'))

    # get debug_level from conf file
    pipeline_list.append(('vect', vectorizer()))
def _build_classifiers(classifiers_conf): for i, clf_name in enumerate(classifiers_conf): if not classifiers_conf[clf_name]: continue # ignore disabled classifiers if not classifiers_conf[clf_name]['run']: logging.debug('Ignoring classifier %s' % clf_name) continue clf = get_named_object(clf_name) init_args = get_intersection_of_parameters(clf, classifiers_conf[clf_name]) yield clf(**init_args)
def _build_feature_selector(init_args, feature_selection_conf, pipeline_list): """ If feature selection is required, this function appends a selector object to pipeline_list and its configuration to configuration. Note this function modifies (appends to) its input arguments """ if feature_selection_conf['run']: method = get_named_object(feature_selection_conf['method']) scoring = feature_selection_conf.get('scoring_function') logging.info('Scoring function is %s', scoring) scoring_func = get_named_object(scoring) if scoring else None # the parameters for steps in the Pipeline are defined as # <component_name>__<arg_name> - the Pipeline (which is actually a # BaseEstimator) takes care of passing the correct arguments down # along the pipeline, provided there are no name clashes between the # keyword arguments of two consecutive transformers. init_args.update(get_intersection_of_parameters(method, feature_selection_conf, 'fs')) logging.info('FS method is %s', method) pipeline_list.append(('fs', method(scoring_func)))