Example #1
def process(
    naive_file,
    treated_file,
    metadata_file,
    resistance_files,
    outfile,
    subtype="All",
    truncate=[41, 235],
):
    print("reading sequences and metadata")
    raw_sequences, consensus = reader(naive_file, treated_file, truncate)

    metadata = read_metadata(metadata_file)

    print(f"choosing {subtype} subtype(s)")
    chosen_sequences, dataset_subtypes = choose_subtype(
        raw_sequences, metadata, subtype)

    print("Filling with consensus AAs")
    AA_sequences = fill_consensus_AAs(chosen_sequences, consensus)
    freqs = get_single_AA_freqs(AA_sequences.drop("label", axis=1))
    single_AA_sequences = get_single_AAs(AA_sequences, freqs)

    print("OneHot encoding")
    columns_to_encode = single_AA_sequences.columns.drop("label")
    encoder = OneHotEncoder(use_cat_names=True,
                            handle_unknown="ignore",
                            cols=columns_to_encode.tolist())
    encoded_sequences = encoder.fit_transform(single_AA_sequences)

    print("removing consensus features")
    features_to_remove = get_features_to_remove(dataset_subtypes)
    total_sequences = encoded_sequences.drop(columns=features_to_remove,
                                             errors="ignore")

    total_sequences["encoded_label"] = total_sequences["label"].apply({
        "treated":
        1,
        "naive":
        0
    }.get)

    drms = get_all_DRMs()
    total_sequences["hasDRM"] = (total_sequences.filter(
        drms, axis=1).any(axis=1).astype(int))

    total_sequences["is_resistant"] = (total_sequences[[
        "encoded_label", "hasDRM"
    ]].any(axis=1).astype(int))

    print("getting resistance scores")
    resistance_scores = get_resistance_scores(resistance_files)

    print("saving dataset to disk")
    joined = total_sequences.join(resistance_scores)
    joined.to_csv(outfile, sep="\t", index=True, header=True)
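For reference, a minimal toy-data sketch (not the dataset above) of how category_encoders' OneHotEncoder names its output columns when use_cat_names=True; the column and values here are invented:

import pandas as pd
from category_encoders import OneHotEncoder

toy = pd.DataFrame({"pos1": ["A", "G", "A"], "label": ["naive", "treated", "naive"]})
enc = OneHotEncoder(cols=["pos1"], use_cat_names=True)
encoded = enc.fit_transform(toy)
# only the listed column is expanded; others pass through untouched
print(encoded.columns.tolist())  # e.g. ['pos1_A', 'pos1_G', 'label']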
Example #2
def one_hot_encoded_result(df_orig):
    df = df_orig.copy(deep=True)
    one_hot_enc = OneHotEncoder(cols=['ordinal_result'], use_cat_names=True)
    one_hot_cols = one_hot_enc.fit_transform(df[['ordinal_result']])
    # strip the trailing two characters from each generated column name
    # (this assumes every appended category suffix is exactly two characters)
    new_one_hot_col_names = [col[:-2] for col in one_hot_cols.columns]
    mapping_dict = {
        old: new
        for old, new in zip(one_hot_cols.columns, new_one_hot_col_names)
    }
    one_hot_cols.rename(columns=mapping_dict, inplace=True)
    one_hot_cols = one_hot_cols[sorted(one_hot_cols.columns)]
    df_with_new_cols = pd.concat([df, one_hot_cols], axis=1)
    return df_with_new_cols
Example #3
	def one_hot_encode(self, data):
		"""
		Vectorizes multiple categorical variables and stores the fitted
		transformation rules. Returns the one-hot encoded dataset.
		The fitted encoder is kept in self.model.

		:param data: dataset used for training (the `data` attribute of a Dataset object)
		"""
		# "inpute" in the original was a typo; older category_encoders
		# versions accept handle_unknown="impute" (newer ones use "value")
		oe = OneHotEncoder(cols=self.columns, handle_unknown="impute")
		oe_data = oe.fit_transform(data)
		self.model = oe
		return oe_data
    def _encode_categories(self):
        """
        This private method encodes categorical variables: label encoding is
        used for ordinal categories and one-hot encoding for nominal ones.
        """

        logging.info(f'#{self._index()} - Encoding categorical columns...')
        # get column names for categorical and numerical features
        categorical_vars = self.X.select_dtypes(include='object').columns
        numerical_vars = self.X.columns.difference(categorical_vars)

        ordinal = pd.Index([
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ])
        nominal = categorical_vars.difference(ordinal)

        standard_mapping = {
            'NA': 0,
            'Po': 1,
            'Fa': 2,
            'TA': 3,
            'Gd': 4,
            'Ex': 5
        }
        mapping_for_ordinals = [{
            'col': column,
            'mapping': standard_mapping
        } for column in ordinal]

        x_num = self.X[numerical_vars]
        x_test_num = self.X_test[numerical_vars]

        # one hot encode categorical columns
        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        label_encoder = OrdinalEncoder(drop_invariant=True,
                                       mapping=mapping_for_ordinals,
                                       handle_unknown='error')

        x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
        x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
        x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
        x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

        self.X = x_num.join(x_cat_ord).join(x_cat_nom)
        self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
        logging.info(f'#{self._step_index} - DONE!')
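The mapping argument above follows category_encoders' OrdinalEncoder contract (a list of {'col': ..., 'mapping': ...} dicts). A minimal standalone illustration with toy data:

import pandas as pd
from category_encoders import OrdinalEncoder

toy = pd.DataFrame({"ExterQual": ["TA", "Gd", "Ex", "Po"]})
mapping = [{"col": "ExterQual",
            "mapping": {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}}]
enc = OrdinalEncoder(mapping=mapping, handle_unknown="error")
print(enc.fit_transform(toy)["ExterQual"].tolist())  # [3, 4, 5, 1]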
Example #5
def preproc_data(data, features):
    '''
    Simple data preprocessing:
        * label-encode the target
        * one-hot encode categorical features
        * fill NaNs with the column median (plus an isNAN indicator)
        * split the data into X and y

    data: pd.DataFrame
    features: dict  # maps column name -> role, e.g. 'target' or 'categorical'
    '''
    # LabelEncoded Target: find the column whose role is 'target'
    for col, role in features.items():
        if role == 'target':
            target_col = col
            data[target_col] = data[target_col].astype('category').cat.codes

    y = data[target_col]
    X = data.drop([target_col], axis=1)

    cat_features = []
    for col, role in features.items():
        if role == 'categorical' and X[col].nunique(dropna=False) > 2:
            cat_features.append(col)

    # LabelEncoded Binary Features
    for feature in X.columns:
        if (X[feature].nunique(dropna=False) < 3):
            X[feature] = X[feature].astype('category').cat.codes
            if feature in cat_features:
                cat_features.remove(feature)

    # One Hot Encoding
    if len(cat_features) > 0:
        encoder = OneHotEncoder(cols=cat_features, drop_invariant=True)
        X = encoder.fit_transform(X)

    # Nans
    nan_columns = list(X.columns[X.isnull().sum() > 0])
    if nan_columns:
        for nan_column in nan_columns:
            X[nan_column + 'isNAN'] = pd.isna(X[nan_column]).astype('uint8')
        # assign the result back: fillna(inplace=True) on X[nan_columns]
        # operates on a copy and leaves X unchanged
        X[nan_columns] = X[nan_columns].fillna(X[nan_columns].median())

    return (X, y)
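To make the NaN handling concrete, a small self-contained check of the indicator-plus-median pattern on toy data:

import pandas as pd

X = pd.DataFrame({"a": [1.0, None, 3.0]})
nan_columns = list(X.columns[X.isnull().sum() > 0])
for nan_column in nan_columns:
    X[nan_column + 'isNAN'] = pd.isna(X[nan_column]).astype('uint8')
# the assignment back is what makes the fill stick
X[nan_columns] = X[nan_columns].fillna(X[nan_columns].median())
print(X)  # a -> [1.0, 2.0, 3.0], aisNAN -> [0, 1, 0]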
Example #6
def main():
    # Preprocess the data
    # start your code here

    # Load data
    data = pd.read_csv("bank.csv")

    # Fix typo in column name
    data.rename(columns={"subcribed": "subscribed"}, inplace=True)

    # Encoding features
    data = data.replace({"yes": 1, "no": 0})
    ohe = OneHotEncoder(
        cols=["job", "marital", "education", "contact", "month", "poutcome"],
        use_cat_names=True,
        return_df=True,
    )
    data = ohe.fit_transform(data)

    # print(data.head())

    # Get features and target
    X = data.drop(columns=["subscribed"])
    y = data["subscribed"]

    # Split training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=100)

    # end your code here

    # print(
    #     "\n\nDecision Tree: -------------------------------------------------------------------------\n\n"
    # )
    # # start your code here

    # tree_classifier = DecisionTreeClassifier(
    #     max_depth=4,
    #     max_leaf_nodes=4,
    #     random_state=100,
    # )

    # tree_classifier.fit(X_train, y_train)
    # y_pred_tree = tree_classifier.predict(X_test)
    # evaluate(y_test, y_pred_tree)

    # # feature_imp_tree = pd.Series(
    # #     tree_classifier.feature_importances_, index=X_train.columns
    # # ).sort_values(ascending=False)[:10]
    # # print(feature_imp_tree)

    # # plt.figure(figsize=(20, 10))

    # # plot_tree(
    # #     tree_classifier,
    # #     feature_names=X_train.columns,
    # #     class_names=["no", "yes"],
    # #     rounded=True,
    # # )
    # # plt.savefig("decision_tree.svg", bbox_inches="tight")
    # # plt.show()

    # # end your code here

    # print(
    #     "\n\nRandom Forest: -------------------------------------------------------------------------\n\n"
    # )
    # # start your code here
    # rf_classifier = RandomForestClassifier(
    #     # bootstrap=False,
    #     criterion="entropy",
    #     max_depth=9,
    #     max_leaf_nodes=21,
    #     min_samples_leaf=5,
    #     random_state=100,
    # )

    # rf_classifier.fit(X_train, y_train)
    # y_pred_rf = rf_classifier.predict(X_test)
    # evaluate(y_test, y_pred_rf)

    # feature_imp_rf = pd.Series(
    #     rf_classifier.feature_importances_, index=X_train.columns
    # ).sort_values(ascending=False)[:10]
    # print(feature_imp_rf)
    # # end your code here

    print(
        "\n\nXGBoost: -------------------------------------------------------------------------\n\n"
    )
    # start your code here
    xgb_classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        learning_rate=0.1,
        max_depth=3,
        min_child_weight=5,
        use_label_encoder=False,
        colsample_bytree=0.3,
    )

    xgb_classifier.fit(X_train, y_train)
    y_pred_xgb = xgb_classifier.predict(X_test)
    evaluate(y_test, y_pred_xgb)
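evaluate() is not defined in this excerpt; a plausible minimal stand-in (an assumption, not the original helper) would be:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate(y_true, y_pred):
    # hypothetical helper: the original definition is not shown in this excerpt
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))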
Example #7
def get_score(model, X_train, y_train, X_test, y_test):
    # Reconstructed header and body: the excerpt began mid-function, so the
    # fit/predict lines here are an assumption consistent with the calls below.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    return score


######### Creating objects for 2 classification models.
logit = LogisticRegression(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)

###################################################################################################
######### Apply One Hot Encoding
from category_encoders import OneHotEncoder
onehot_enc = OneHotEncoder(cols=X_Columns)

print('Original number of features: \n', X_train.shape[1], "\n")
# fit once on the training split; the separate fit() call in the original was
# redundant because fit_transform() refits anyway
data_ohe_train = onehot_enc.fit_transform(X_train)
data_ohe_test = onehot_enc.transform(X_test)
print('Features after OHE: \n', data_ohe_train.shape[1])

######### Logistic Regression
onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test,
                               y_test)
print('Logistic Regression score with One hot encoding:', onehot_logit_score)

######### Random Forest
onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test)
print('Random Forest score with One hot encoding:', onehot_rf_score)

###################################################################################################
######### Apply Hashing Encoding
from category_encoders import HashingEncoder
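The hashing section is cut off here; assuming it mirrors the one-hot block above, it would presumably continue along these lines (a sketch, not the original code; n_components is an arbitrary choice):

hashing_enc = HashingEncoder(cols=X_Columns, n_components=32)
data_hash_train = hashing_enc.fit_transform(X_train)
data_hash_test = hashing_enc.transform(X_test)
print('Features after hashing: \n', data_hash_train.shape[1])

hashing_logit_score = get_score(logit, data_hash_train, y_train,
                                data_hash_test, y_test)
print('Logistic Regression score with Hashing encoding:', hashing_logit_score)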
def aggregate_per_time_interval(date_interval):

    ### Importing
    customer_data = pd.read_csv('Data/olist_customers_dataset.csv')
    geolocation_data = pd.read_csv('Data/olist_geolocation_dataset.csv')
    order_items_data = pd.read_csv('Data/olist_order_items_dataset.csv')
    order_payments_data = pd.read_csv('Data/olist_order_payments_dataset.csv')
    order_reviews_data = pd.read_csv('Data/olist_order_reviews_dataset.csv')
    olist_order_data = pd.read_csv('Data/olist_orders_dataset.csv')
    olist_products_data = pd.read_csv('Data/olist_products_dataset.csv')
    olist_sellers_data = pd.read_csv('Data/olist_sellers_dataset.csv')
    olist_product_category_data = pd.read_csv(
        'Data/product_category_name_translation.csv')

    ### Converts column of interest to datetime format

    olist_order_data['order_purchase_timestamp'] = pd.to_datetime(
        olist_order_data['order_purchase_timestamp'])

    ### Keeps dates that are between the given date limits

    mask = (olist_order_data['order_purchase_timestamp'] >=
            date_interval[0]) & (olist_order_data['order_purchase_timestamp'] <
                                 date_interval[1])
    olist_order_data = olist_order_data[mask]

    ### Rest of function is the same as in first notebook of the project

    ### Olist_products_dataset merge to get product category name in english
    olist_products_data = olist_products_data.merge(
        olist_product_category_data, how='left', on='product_category_name')

    ### Merge order items dataset with products dataset
    order_items_data = order_items_data.merge(olist_products_data,
                                              how='left',
                                              on='product_id')

    ### Count number of occurrences for each order ID
    count = order_items_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_items per order')

    ### Numeric data will be aggregated by mean
    num_order_items_data = pd.concat([
        order_items_data['order_id'],
        order_items_data.select_dtypes('float64')
    ],
                                     axis=1)

    num_order_items_data = num_order_items_data.groupby('order_id').mean()

    ### Aggregate each order's products category names by its most frequent value
    cat_order_items_data = order_items_data[[
        'order_id', 'product_category_name_english'
    ]].groupby('order_id').agg(lambda g: g.value_counts().index[0]
                               if np.any(g.notnull()) else np.nan)

    order_items_data = pd.concat(
        [count, num_order_items_data, cat_order_items_data], axis=1)

    olist_order_data = olist_order_data.merge(order_items_data,
                                              how='left',
                                              on='order_id')

    ### Number of payments
    ###1. Count the number

    ### Count number of payments per order

    count = order_payments_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_payments per order')

    ### One hot encode payment type feature

    enc = OneHotEncoder(cols=['payment_type'], use_cat_names=True)
    order_payments_data = enc.fit_transform(order_payments_data)

    order_payments_data = order_payments_data.drop('payment_type_not_defined',
                                                   axis=1)

    order_payments_data = order_payments_data.groupby('order_id').mean()

    order_payments_data = pd.concat([order_payments_data, count], axis=1)

    olist_order_data = olist_order_data.merge(order_payments_data,
                                              how='left',
                                              on='order_id')

    ### Number of reviews per order

    count = order_reviews_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_reviews per order').astype('float64')

    order_reviews_data = order_reviews_data[['order_id', 'review_score'
                                             ]].groupby('order_id').mean()

    order_reviews_data = pd.concat([count, order_reviews_data], axis=1)

    olist_order_data = olist_order_data.merge(order_reviews_data,
                                              how='left',
                                              on='order_id')

    ### Merging customer table with order tables

    customer_data = customer_data.merge(olist_order_data,
                                        how='inner',
                                        on='customer_id')

    ### Customer data aggregation
    count = (customer_data.groupby('customer_unique_id')
             .count().iloc[:, 0].rename('n_orders per customer'))

    ### Numeric features aggregated by mean
    numeric_customer_data = pd.concat([
        customer_data.select_dtypes('float64'),
        customer_data['customer_unique_id']
    ],
                                      axis=1)

    numeric_customer_data = numeric_customer_data.groupby(
        'customer_unique_id').mean()

    ### Categorical features aggregated by most frequent value
    cat_customer_data = customer_data[[
        'customer_unique_id', 'product_category_name_english'
    ]].groupby('customer_unique_id').agg(lambda g: g.value_counts().index[0]
                                         if np.any(g.notnull()) else np.nan)

    customer_data = pd.concat(
        [count, numeric_customer_data, cat_customer_data], axis=1)

    return customer_data
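An illustrative call (the interval bounds are arbitrary; pandas compares the parsed timestamps against these date strings directly):

customers_h1_2017 = aggregate_per_time_interval(('2017-01-01', '2017-07-01'))
print(customers_h1_2017.shape)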
Example #9
File: manip.py  Project: krashr-ds/DS
def doCleanupEncode(X,
                    y=None,
                    cat=None,
                    oh=None,
                    binary=None,
                    loo=None,
                    woe=None,
                    lp_cols=None,
                    NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        if cat is not None and oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            # (the original `|` was a bug: bitwise-or on None raises TypeError,
            # and cat + oh below requires both lists to be present)
            X = replaceCVs(X, cat + oh, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator')
            # fit_transform fits the encoder itself; the trailing .fit(X) in
            # the original caused a redundant second fit
            X = ec.fit_transform(X)
            # dropping these columns did not help performance
            # for o in oh:
            #    stem = o.split("_")[1]
            #    d1 = "L_" + stem + "_-1"
            #    d2 = "L_" + stem + "_nan"
            #    print("DROPPING ", d1, " ", d2, "\n")
            #    X.drop(d1, axis=1, errors='ignore', inplace=True)
            #    X.drop(d2, axis=1, errors='ignore', inplace=True)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')

        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')

        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    # Cast all to int64
    # X = X.astype("int64")

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
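An illustrative call; the column names below are hypothetical, not from the original project:

X_clean = doCleanupEncode(X, y=y,
                          oh=['state'],
                          binary=['flag_a', 'flag_b'],
                          woe=['employer'],
                          loo=['zip_code'],
                          lp_cols=['row_id'],
                          NoData=True)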
Example #10
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()
train.info()
test_id = test['id'] # save for submission
del train['id']
del test['id']
train['type'].unique(), train['color'].unique()
sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)
from category_encoders import OneHotEncoder

encoder = OneHotEncoder(cols=['color'], use_cat_names=True)

train = encoder.fit_transform(train)
test = encoder.transform(test)  # reuse the fitted encoder; refitting on test would learn its own categories
train.head()
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit(train['type'])

print(encoder.classes_)

train['type_no'] = encoder.transform(train['type'])
train.head()
sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))
target = train['type_no'] # for visualizations
target_string = train['type'] # for final predictions
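Since test_id and the fitted LabelEncoder were kept, predictions can be mapped back to class strings for the submission; a sketch, where preds is a placeholder rather than real model output:

import numpy as np

preds = np.zeros(len(test), dtype=int)  # placeholder for a fitted model's predictions
submission = pd.DataFrame({'id': test_id, 'type': encoder.inverse_transform(preds)})
submission.to_csv('submission.csv', index=False)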
Example #11
def oneHotEncoding(df, column):
    from category_encoders import OneHotEncoder
    encoder = OneHotEncoder(cols=[column])
    df = encoder.fit_transform(df)
    return df
    def _encode_categories(self):
        """
        This private method encodes categorical variables: label encoding is
        used for ordinal categories and one-hot encoding for nominal ones.
        """

        logging.info(f'#{self._index()} - Encoding categorical columns...')

        def encode(data):
            # encode Sex column
            data['Sex'] = data['Sex'] == 'male'

            # encode Name column
            name_cols = data['Name'].apply(lambda x: pd.Series(
                [str(x).split(",")[0],
                 str(x).split(", ")[1].split(".")[0]],
                index=['Family name', 'Title']))
            data = data.join(name_cols)

            # identify Titles with same meaning
            data['Title'].replace(
                {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}, inplace=True)

            # group rare Titles
            title_names = (data['Title'].value_counts() < 10)
            data['Title'] = data['Title'].apply(lambda x: 'Misc'
                                                if title_names.loc[x] else x)

            # create Family size and Alone column from SibSp, Parch cols
            data['Family size'] = data['SibSp'] + data['Parch'] + 1
            data['Alone'] = data['Family size'] == 1

            # make 5 equal size groups from Fares
            data['Fare'] = pd.qcut(data['Fare'], 5, labels=False)

            # make 5 groups from Ages
            data['Age'] = pd.cut(data['Age'], 5, labels=False)

            # rename columns and delete unnecessary features
            data = data.rename(columns={
                'Sex': 'Male',
                'Fare': 'FareBins',
                'Age': 'AgeBins'
            })
            data.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)

            return data

        self.X = encode(self.X)
        self.X_test = encode(self.X_test)

        for col in self.X.columns:
            if self.X[col].dtype != 'float64':
                table = (self.X.join(self.y)[[col, 'Survived']]
                         .groupby(col, as_index=False).mean())
                table['Survived'] = (table['Survived'] * 100).map(
                    '{:.2f} %'.format)
                logging.info(
                    f'Survival ratio by: {col}\n{table}\n{"-" * 10}\n')

        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        one_hot_columns = one_hot_encoder.fit_transform(
            self.X[['Title', 'Embarked']])
        one_hot_columns_test = one_hot_encoder.transform(
            self.X_test[['Title', 'Embarked']])
        self.X = self.X.join(one_hot_columns)
        self.X_test = self.X_test.join(one_hot_columns_test)

        self.X.drop(['Family name', 'Title', 'Embarked'], axis=1, inplace=True)
        self.X_test.drop(['Family name', 'Title', 'Embarked'],
                         axis=1,
                         inplace=True)

        logging.info(f'#{self._step_index} - DONE!')
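The Fare/Age binning above relies on the qcut/cut distinction: pd.qcut makes equal-frequency bins, pd.cut equal-width bins. A quick check:

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
print(pd.qcut(s, 5, labels=False).tolist())  # [0, 1, 2, 3, 4] - equal-frequency bins
print(pd.cut(s, 5, labels=False).tolist())   # [0, 0, 0, 0, 4] - equal-width bins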
Example #13
# x: receives all columns except the class (income); inplace=False means df itself keeps the column. y: classes
x = df.drop('income', axis=1, inplace=False)
y = df.income

# TRAIN and TEST DATA
# split train and test 70x30
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# select non numeric columns - decision tree
#var = df.select_dtypes(include='object')
# print(var.head())
df.select_dtypes(include='object')

# OneHotEncoder: Categorical column to integer
ohe = OneHotEncoder(use_cat_names=True)
x_train = ohe.fit_transform(x_train)
x_train.head()

# StandardScaler - pre-processor to put numerical column in the same scale
scaler = StandardScaler().fit(x_train)

scaler

values_scale = scaler.transform(x_train)
values_scale[:10]
x_train = scaler.transform(x_train)

# generate the model - could be any model
# instance of the classifier decision tree and train the model
clf_tree = tree.DecisionTreeClassifier()
clf_tree = clf_tree.fit(x_train, y_train)
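Note that the test split must go through the same fitted transformers before prediction; a sketch of the missing step using the variables above:

# transform (never fit) the test split with the already-fitted encoder/scaler
x_test = ohe.transform(x_test)
x_test = scaler.transform(x_test)
y_pred = clf_tree.predict(x_test)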
Example #14
class Encoder():
    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }
    # target_encoder and mean_encoder must not be fit on the train and
    # validation sets concatenated together; label_encoder and onehot_encoder can be.

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # store the flag under a distinct name so it does not shadow the
        # save_encoder() method (the original bare `self.save_encoder` was a no-op)
        self._save_encoder = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self,
            x_train,
            x_val=None,
            y_train=None,
            y_val=None,
            method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame

        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    'Only [OrdinalEncoder, OneHotEncoder, CountEncoder, '
                    'TargetEncoder] are supported; got %s' % feat)

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                x_whole = x_train.append(x_val)
                y_whole = None
                if not y_train is None and not y_val is None:
                    y_whole = y_train.append(y_val)

                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]

            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

            if self._save_encoder:
                self.save_encoder()
        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self,
                      x_train,
                      x_val=None,
                      y_train=None,
                      y_val=None,
                      method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame
        
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))

        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)

        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        # the original body duplicated save_encoder (opening files in write
        # mode and dumping); loading should read and assign instead
        logdir = logdir or self.logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)

        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'), 'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'), 'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'), 'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'), 'r') as f:
            self.target_encoder_features = json.load(f)
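A sketch of how this Encoder would be driven; the feature names below are placeholders:

enc = Encoder(handle_unknown='-99999', save_encoder=False)
mapper = {
    'city': 'OneHotEncoder',
    'grade': 'OrdinalEncoder',
    'brand': 'CountEncoder',
    'channel': 'TargetEncoder',
}
x_train, y_train, x_val, y_val = enc.fit_transform(
    x_train, x_val, y_train, y_val, method_mapper=mapper)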
Example #15
File: umap_vis.py  Project: sungreong/TIL
class CombineUmap(object) :
    trans_cat_names = None
    numeric_mapper = None
    cat_mapper = None
    intersection_mapper = None 
    union_mapper = None 
    contrast_mapper = None 
    
    def __init__(self, num_info:Dict[str,str], cat_info:Dict[str,str], target:str,**kwargs ) -> None:
        super().__init__()
        self.num_info = num_info
        self.cat_info = cat_info
        self.target = target
        self.assign_num_scaler()
        self.assign_cat_scaler()
    def assign_num_scaler(self,) :
        self.num_method = self.num_info.get("method", None)
        self.num_cols = self.num_info.get("cols", [])
        if self.num_method is None : 
            self.num_scaler = Empty() 
        elif self.num_method == "RobustScaler" :
            self.num_scaler = RobustScaler()
        else :
            raise NotImplementedError("other methods not implemented yet")
    def assign_cat_scaler(self,) :    
        self.cat_method = self.cat_info.get("method", None)
        self.cat_cols = self.cat_info.get("cols", [])
        if self.cat_method is None : 
            self.cat_encoder = Empty() 
        elif self.cat_method == "OrdinalEncoder" :
            self.cat_encoder = OrdinalEncoder(cols = self.cat_cols)
        elif self.cat_method == "OneHotEncoder" :
            self.cat_encoder = OneHotEncoder(cols = self.cat_cols)
        else :
            raise NotImplementedError("other methods not implemented yet")
    def fit(self, df:pd.DataFrame, 
            num_kwargs={"n_neighbors":15, "random_state":42,"n_jobs":20},
            cat_kwargs={"metric":"dice", "n_neighbors" : 150, "random_state" : 42,"n_jobs":20}) :
        if self.num_cols != [] :
            df = self.scale_num(df)
            self.numeric_mapper = umap.UMAP(**num_kwargs).fit(df[self.num_cols])
        if self.cat_cols != [] :
            df = self.encode_cat(df)
            self.cat_mapper = umap.UMAP(**cat_kwargs).fit(df[self.trans_cat_names])
        return self 
    def transform(self,df:pd.DataFrame) :
        result = [None , None]
        if self.num_cols != [] :
            result[0] = self.num_transform(df)
        if self.cat_cols != [] :
            result[1] = self.cat_transform(df)
        return result
    def make_new_mapper(self,) :
        if (self.numeric_mapper is not None) & (self.cat_mapper is not None) :
            self.intersection_mapper = self.numeric_mapper * self.cat_mapper
            self.union_mapper = self.numeric_mapper + self.cat_mapper
            self.contrast_mapper = self.numeric_mapper - self.cat_mapper
            print("make new mapper 1.intersection_mapper 2.union_mapper 3.contrast_mapper")
        return self
    def num_transform(self,df:pd.DataFrame) :
        df = self.scale_num(df)
        return self.numeric_mapper.transform(df[self.num_cols])
    def cat_transform(self, df:pd.DataFrame) :
        df = self.encode_cat(df)
        return self.cat_mapper.transform(df[self.trans_cat_names])
    def vis_basic(self,embedding:np.array,target=None,classes=None) :
        _, ax = plt.subplots(1, figsize=(14, 10))
        plt.scatter(*embedding.T, s=0.3, c=target, cmap='Spectral', alpha=1.0)
        plt.setp(ax, xticks=[], yticks=[])
        if classes is not None :
            num_class = len(classes)
            cbar = plt.colorbar(boundaries=np.arange(num_class+1)-0.5)
            cbar.set_ticks(np.arange(num_class))
            cbar.set_ticklabels(classes)
        plt.title('Embedded via UMAP');
        return self
    def vis_diagnostic(self, mapper,diagnostic_type="vq") :
        """[summary]

        Args:
            mapper ([type]): [description]
            diagnostic_type (str, optional): [description]. Defaults to "vq".

        Returns:
            [type]: [description]
        example 
        diagnostic_type= ["vq","local_dim","neighborhood","pca"],
        """
        return umap.plot.diagnostic(mapper, diagnostic_type=diagnostic_type)
    
    def vis_interactive(self,mapper,**kwargs) :
        umap.plot.output_notebook()
        p = umap.plot.interactive(mapper,**kwargs)
        umap.plot.show(p)
        return self
    
    def vis_connectivity(self,mapper ,**kwargs) :
        #edge_bundling='hammer'
        return umap.plot.connectivity(mapper, show_points=True,**kwargs)
    
    def vis_points(self, mapper, values=None,**kwargs) :
        """[summary]

        Args:
            mapper ([type]): [description]
            values ([type]): [description]
            
        Returns:
            [type]: [description]
        kwargs example
        {"theme" : "fire","background":"black"}
        """
        return umap.plot.points(mapper, values=values,   **kwargs)
        
    def scale_num(self, df:pd.DataFrame)  :
        df[self.num_cols]= self.num_scaler.fit_transform(df[self.num_cols])
        return df 
    
    def encode_cat(self,df:pd.DataFrame)  :
        df = self.cat_encoder.fit_transform(df)
        if self.cat_method == "OneHotEncoder" :
            self.trans_cat_names = self.get_onehot_names()
        elif self.cat_method == "OrdinalEncoder" :
            self.trans_cat_names = self.get_label_names()    
        return df 
    
    def get_label_names(self,) : 
        return self.cat_encoder.cols
    
    def get_onehot_names(self,) :
        onehot_names = [
            feature_name for feature_name in self.cat_encoder.feature_names
            if any(re.search(f"^{i}_", feature_name) for i in self.cat_encoder.cols)
        ]
        return onehot_names
# Convert price from object to float
nyc["price"] = nyc["price"].replace('[\$,]', '', regex=True).astype(float)

# Convert bathrooms, bedrooms, and beds from float to int
nyc = nyc.astype({'bathrooms': int, 'bedrooms': int, 'beds': int})

# Create target vector
y = nyc[target]

# Create feature matrix
X = nyc[features]

# Instantiate and apply One Hot Encoder to room_type
ohe = OneHotEncoder(use_cat_names=True)
X_transform = ohe.fit_transform(X)

# Instantiate model
model = LinearRegression()

# Fit model
model_lr = model.fit(X_transform, y)


def predict_price(user_input):
    # Store prediction
    user_input_ = np.array(user_input)
    user_input = user_input_.reshape(1, -1)
    prediction = model_lr.predict(user_input)
    return prediction
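One caveat: model_lr was fit on the one-hot-encoded matrix, so raw user input should pass through the same fitted encoder first. A hypothetical safer variant:

def predict_price_encoded(user_input: dict):
    # route the input through the fitted encoder so its columns line up
    # with the training matrix X_transform
    row = pd.DataFrame([user_input], columns=features)
    row = ohe.transform(row)
    return model_lr.predict(row)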
Example #17
#X_train_encoded.head()


X = dataset[:,0:5]
y = dataset[:,5]
y=np.reshape(y, (-1,1))
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

#scaler_y.fit(y)
#yscale=scaler_y.transform(y)
#X = StandardScaler().fit_transform(X)
#y = StandardScaler().fit_transform(y.reshape(len(y),1))[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y)

# `encoder` is created earlier in the original file; this excerpt starts mid-script
X_train_encoded = encoder.fit_transform(X_train)
scaler_x.fit(X_train_encoded)
xscale=scaler_x.transform(X_train_encoded)

'''model = Sequential()
model.add(Dense(12, input_dim=5, kernel_initializer='normal', activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
history = model.fit(X_train, y_train, epochs=150, batch_size=50,  verbose=1, validation_split=0.2)'''


class Item(BaseModel):
    """Use this data model to parse the request body JSON."""
    Strain:str = Field(..., example="13-Dawgs")
    Type: str = Field(..., example="sativa")