def std_scaler(trfm, col_names, **kwargs):
    """
    Build the PMML derived fields that reproduce a fitted StandardScaler.

    Every column becomes an ``Apply('/')`` wrapping an ``Apply('-')``: the
    inner node combines the column with its mean, the outer node combines
    that result with the column's scale.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.
    **kwargs :
        ``derived_fld`` (optional) : derived fields produced by an earlier
        step. They are placed in front of the scaler's own fields when
        ``is_present("labelBinarizer", col_names)`` is truthy.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields and
        ``'der_col_names'`` the names returned by ``get_derived_colnames``.
    """
    derived_flds = list()
    pp_dict = dict()
    if is_present("labelBinarizer", col_names):
        # .get() so that a caller which does not forward the hidden fields
        # gets "nothing to prepend" rather than a KeyError.
        derived_flds_hidden = kwargs.get('derived_fld')
        if derived_flds_hidden:
            derived_flds.extend(derived_flds_hidden)
    derived_colnames = get_derived_colnames('standardScaler', col_names)

    # StandardScaler.transform only centres when with_mean is True and only
    # scales when with_std is True (scale_ is None otherwise; mean_ may still
    # be populated although it is not subtracted). Fall back to the neutral
    # constants so the exported expression matches what sklearn computes.
    means = trfm.mean_ if getattr(trfm, 'with_mean', True) else None
    scales = trfm.scale_ if getattr(trfm, 'with_std', True) else None

    for col_name_idx, col_name in enumerate(col_names):
        mean_val = 0.0 if means is None else means[col_name_idx]
        scale_val = 1.0 if scales is None else scales[col_name_idx]
        apply_inner = [pml.Apply(
            function='-',
            Constant=[pml.Constant(
                dataType="double",
                valueOf_=unround_scalers(mean_val)
            )],
            FieldRef=[pml.FieldRef(field=col_name)]
        )]
        apply_outer = pml.Apply(
            Apply_member=apply_inner,
            function='/',
            Constant=[pml.Constant(
                dataType="double",
                valueOf_=unround_scalers(scale_val)
            )]
        )
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            name=derived_colnames[col_name_idx],
            optype="continuous",
            dataType="double"
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def rbst_scaler(trfm, col_names):
    """
    Build the PMML derived fields that reproduce a fitted RobustScaler.

    Every column that is not listed in ``exception_cols`` becomes an
    ``Apply('/')`` wrapping an ``Apply('-')``: the inner node combines the
    column with its centre (and carries a 'scaling' extension naming
    RobustScaler), the outer node combines that result with the scale.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields (none for columns in
        ``exception_cols``) and ``'der_col_names'`` the names returned by
        ``get_derived_colnames`` for all of ``col_names``.
    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames('robustScaler', col_names)

    # RobustScaler stores None in center_ / scale_ when it was built with
    # with_centering=False / with_scaling=False; use the neutral constants
    # instead of failing on the subscript.
    centers = trfm.center_
    scales = trfm.scale_

    for col_name_idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        center_val = 0.0 if centers is None else centers[col_name_idx]
        scale_val = 1.0 if scales is None else scales[col_name_idx]
        apply_inner = [pml.Apply(
            function='-',
            Constant=[pml.Constant(
                dataType="double",
                valueOf_=unround_scalers(center_val)
            )],
            FieldRef=[pml.FieldRef(field=col_name)],
            Extension=[pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])]
        )]
        apply_outer = pml.Apply(
            Apply_member=apply_inner,
            function='/',
            Constant=[pml.Constant(
                dataType="double",
                valueOf_=unround_scalers(scale_val)
            )]
        )
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            name=derived_colnames[col_name_idx],
            optype="continuous",
            dataType="double"
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def min_max_scaler(trfm, col_names):
    """
    Build the PMML derived fields that reproduce a fitted MinMaxScaler.

    Every column that is not listed in ``exception_cols`` becomes an
    ``Apply('+')`` wrapping an ``Apply('*')``: the inner node combines the
    column with ``trfm.scale_``, the outer node adds ``trfm.min_``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields (none for columns in
        ``exception_cols``) and ``'der_col_names'`` the names returned by
        ``get_derived_colnames`` for all of ``col_names``.
    """
    scaled_names = get_derived_colnames("minMaxScaler", col_names)
    fields = []
    for position, source_col in enumerate(col_names):
        if source_col in exception_cols:
            # Excluded columns keep their slot in scaled_names but get no field.
            continue
        multiplier = pml.Constant(
            dataType="double",
            valueOf_=unround_scalers(trfm.scale_[position])
        )
        product = pml.Apply(
            function='*',
            Constant=[multiplier],
            FieldRef=[pml.FieldRef(field=source_col)]
        )
        offset = pml.Constant(
            dataType="double",
            valueOf_=unround_scalers(trfm.min_[position])
        )
        shifted = pml.Apply(
            Apply_member=[product],
            function='+',
            Constant=[offset]
        )
        fields.append(pml.DerivedField(
            Apply=shifted,
            name=scaled_names[position],
            optype="continuous",
            dataType="double"
        ))
    return {'der_fld': fields, 'der_col_names': scaled_names}
def pca(trfm, col_names):
    """
    Build the PMML derived fields that reproduce a fitted PCA projection.

    One derived field is produced per row of ``trfm.components_``. Each is
    an ``Apply('sum')`` over one ``Apply('*')`` per input feature, which in
    turn wraps an ``Apply('-')`` combining the feature with its mean.
    Fields are named ``"PCA<counter>-<component index>"``, where the counter
    is the function attribute ``pca.counter`` incremented on every call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields and
        ``'der_col_names'`` their names.
    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()

    means = trfm.mean_
    components = trfm.components_
    # Take both sizes from the fitted matrix. The constructor argument
    # n_components may be None, a float or 'mle', and n_features_ is no
    # longer available on recent scikit-learn releases.
    n_components, n_features = components.shape

    # NOTE(review): no division by the explained variance is emitted, so a
    # PCA fitted with whiten=True would presumably not be reproduced - confirm.
    for preprocess_idx in range(n_components):
        terms = list()
        for pca_idx in range(n_features):
            centred = pml.Apply(
                function='-',
                Constant=[pml.Constant(dataType="double", valueOf_=means[pca_idx])],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])]
            )
            loading = components[preprocess_idx][pca_idx]
            if loading == 0.0:
                # Any zero loading (including -0.0) is emitted as plain 0.0.
                loading = 0.0
            terms.append(pml.Apply(
                function="*",
                Apply_member=[centred],
                Constant=[pml.Constant(dataType="double", valueOf_=loading)]
            ))
        field = pml.DerivedField(
            Apply=pml.Apply(function="sum", Apply_member=terms),
            dataType="double",
            optype="continuous",
            name="PCA" + str(pca.counter) + "-" + str(preprocess_idx)
        )
        derived_flds.append(field)
        derived_colnames.append(field.get_name())
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def tfidf_vectorizer(trfm, col_names):
    """
    Build the PMML derived fields that reproduce a fitted TfIdfVectorizer.

    The first derived field lower-cases the text column ``col_names[0]``.
    It is followed by one field per vocabulary term: an ``Apply('*')``
    combining a ``TextIndex`` over the lower-cased column with the term's
    idf weight.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes. Only the first
        entry is used.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the derived fields, ``'der_col_names'`` the
        names returned by ``get_derived_colnames`` for the terms,
        ``'pp_feat_name'`` is ``col_names[0]`` and ``'pp_feat_class_lbl'``
        is an empty list.
    """
    pp_dict = dict()
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # its replacement and fall back for older installations.
    if hasattr(trfm, 'get_feature_names_out'):
        features = [str(term) for term in trfm.get_feature_names_out()]
    else:
        features = trfm.get_feature_names()
    idfs = trfm.idf_
    # NOTE(review): vocabulary_ keys are taken in dict order, which may not
    # line up with the order of `features` - confirm this is intended.
    extra_features = list(trfm.vocabulary_.keys())
    text_col = col_names[0]
    lowered_name = 'lowercase(' + text_col + ')'

    derived_colnames = get_derived_colnames('tfidf@[' + text_col + ']', features)
    derived_flds = [
        pml.DerivedField(
            name=lowered_name,
            optype='categorical',
            dataType='string',
            Apply=pml.Apply(function='lowercase',
                            FieldRef=[pml.FieldRef(field=text_col)])
        )
    ]
    for feat_idx, (term, idf) in enumerate(zip(features, idfs)):
        text_index = pml.TextIndex(
            textField=lowered_name,
            # Raw string: same characters as before, without relying on an
            # invalid escape sequence.
            wordSeparatorCharacterRE=r'\s+',
            tokenize='true',
            Constant=pml.Constant(valueOf_=term),
            Extension=[pml.Extension(anytypeobjs_=[extra_features[feat_idx]])]
        )
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[feat_idx],
            optype='continuous',
            dataType='double',
            Apply=pml.Apply(function='*',
                            TextIndex=[text_index],
                            Constant=[pml.Constant(valueOf_=idf)])
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = text_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def polynomial_features(trfm, col_names):
    """
    Build the PMML derived fields that reproduce fitted PolynomialFeatures.

    One derived field is produced per row of ``trfm.powers_``: an
    ``Apply('product')`` over one ``Apply('pow')`` per input column, whose
    integer constant is that column's entry in the row. Fields are named
    ``"poly<counter>-x<row index>"``, where the counter is the function
    attribute ``polynomial_features.poly_ctr`` incremented on every call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields and
        ``'der_col_names'`` their names.
    """
    polynomial_features.poly_ctr += 1
    exponents = trfm.powers_
    fields = []
    field_names = []
    for term_idx in range(exponents.shape[0]):
        factors = [
            pml.Apply(
                function='pow',
                Constant=[pml.Constant(
                    dataType="integer",
                    valueOf_=int(exponents[term_idx][col_idx])
                )],
                FieldRef=[pml.FieldRef(field=col_name)]
            )
            for col_idx, col_name in enumerate(col_names)
        ]
        term_field = pml.DerivedField(
            Apply=pml.Apply(function="product", Apply_member=factors),
            dataType="double",
            optype="continuous",
            name="poly" + str(polynomial_features.poly_ctr) + '-' + "x" + str(term_idx)
        )
        fields.append(term_field)
        field_names.append(term_field.get_name())
    return {'der_fld': fields, 'der_col_names': field_names}
def binarizer(trfm, col_names):
    """
    Build the PMML derived fields that reproduce a Binarizer.

    Every column becomes an ``Apply('threshold')`` combining the column
    with the constant ``trfm.threshold``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``'der_fld'`` holds the list of derived fields and
        ``'der_col_names'`` the names returned by ``get_derived_colnames``.
    """
    out_names = get_derived_colnames("binarizer", col_names)
    fields = [
        pml.DerivedField(
            Apply=pml.Apply(
                function='threshold',
                Constant=[pml.Constant(dataType="double", valueOf_=trfm.threshold)],
                FieldRef=[pml.FieldRef(field=source_col)]
            ),
            name=out_names[position],
            optype="continuous",
            dataType="double"
        )
        for position, source_col in enumerate(col_names)
    ]
    return {'der_fld': fields, 'der_col_names': out_names}
def imputer(trfm, col_names, **kwargs):
    """
    Build the PMML representation of a fitted Imputer.

    Two outputs are possible, chosen by ``any_in(imputer.col_names,
    col_names)`` (``imputer.col_names`` is a function attribute set by the
    caller):

    * falsy - one derived field per column not in ``exception_cols``: an
      ``Apply('if')`` wrapping ``Apply('isMissing')`` on the column, with
      the replacement constant and a reference to the column as its
      remaining members. Names come from ``get_derived_colnames``.
    * truthy - no derived fields; the strategy, the replacement values and
      the column names are returned for the caller to use instead.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names. The column names may
        represent the names of preprocessed attributes.
    **kwargs :
        Accepted for call compatibility; no keyword is read here.

    Returns
    -------
    pp_dict : dictionary
        Always contains ``'der_fld'`` and ``'der_col_names'``. In the
        second case it also contains ``'mining_strategy'``,
        ``'mining_replacement_val'`` and ``'mining_attributes'``.
    """
    # sklearn strategy -> PMML missingValueTreatment. "constant" used to be
    # passed through unchanged, which is not a PMML value; unknown strategies
    # are still passed through as they are.
    strategy_to_pmml = {
        "mean": "asMean",
        "median": "asMedian",
        "most_frequent": "asMode",
        "constant": "asValue",
    }
    mining_strategy = strategy_to_pmml.get(trfm.strategy, trfm.strategy)
    mining_replacement_val = trfm.statistics_

    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue
            is_missing = pml.Apply(
                function='isMissing',
                FieldRef=[pml.FieldRef(field=col_name)]
            )
            replacement = pml.Constant(
                dataType="double",
                valueOf_=mining_replacement_val[col_name_idx]
            )
            # The field reference travels in the Constant list so that it
            # follows the replacement value; the tag name is overridden so
            # it is still written out as a FieldRef.
            passthrough = pml.FieldRef(field=col_name)
            passthrough.original_tagname_ = "FieldRef"
            apply_outer = pml.Apply(
                Apply_member=[is_missing],
                function='if',
                Constant=[replacement, passthrough]
            )
            derived_flds.append(pml.DerivedField(
                Apply=apply_outer,
                name=derived_colnames[col_name_idx],
                optype="continuous",
                dataType="double"
            ))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict