Example #1
def lesson_6():
    # -------------------------
    # Code from previous lesson
    # -------------------------
    # Load data
    melbourne_data = pd.read_csv(melbourne_file_path)
    # Filter rows with missing price values
    filtered_melbourne_data = melbourne_data.dropna(axis=0)
    # Choose target and features
    y = filtered_melbourne_data.Price
    melbourne_features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
        'Lattitude', 'Longtitude'
    ]
    X = filtered_melbourne_data[melbourne_features]
    # Split data into training and validation data,
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

    # -----------------
    # Start of tutorial
    # -----------------
    # Build a random forest model
    forest_model = RandomForestRegressor(random_state=1)
    forest_model.fit(train_X, train_y)
    melb_preds = forest_model.predict(val_X)
    print_("MAE for a Random Forest", 0)
    print_(mean_absolute_error(val_y, melb_preds))
Example #2
File: tutorials.py Project: raul23/ML-labs
def lesson_7():
    print_("LESSON 7: Data Leakage", 0, 1)
    X, y = load_data_from_ex_7()

    # Since there is no preprocessing, we don't need a pipeline (we use one anyway as a best practice!)
    my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
    cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')

    print("Cross-validation accuracy: %f" % cv_scores.mean())

    expenditures_cardholders = X.expenditure[y]
    expenditures_noncardholders = X.expenditure[~y]

    print('\nFraction of those who did not receive a card and had no expenditures: %.2f' \
          % ((expenditures_noncardholders == 0).mean()))
    print('Fraction of those who received a card and had no expenditures: %.2f' \
          % ((expenditures_cardholders == 0).mean()))

    # We would run a model without target leakage as follows:
    # Drop leaky predictors from dataset
    potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
    X2 = X.drop(potential_leaks, axis=1)

    # Evaluate the model with leaky predictors removed
    cv_scores = cross_val_score(my_pipeline, X2, y, cv=5, scoring='accuracy')

    print("\nCross-val accuracy: %f" % cv_scores.mean())
Example #3
def ex_5():
    print_("Exercise 5: Distributions", 0, 1)

    # ---------------------
    # Step 1: Load the data
    # ---------------------
    # Fill in the line below to read the (benign) file
    cancer_b_data = pd.read_csv(cancer_b_filepath, index_col="Id")

    # Fill in the line below to read the (malignant) file
    cancer_m_data = pd.read_csv(cancer_m_filepath, index_col="Id")

    # -----------------------
    # Step 2: Review the data
    # -----------------------
    # Print the first five rows of the (benign) data
    print_("First 5 rows of the benign data", 0)
    print_(cancer_b_data.head())

    # Print the first five rows of the (malignant) data
    print_("First 5 rows of the malignant data", 0)
    print_(cancer_m_data.head())

    # ---------------------------------
    # Step 3: Investigating differences
    # ---------------------------------
    # Part A
    # Histograms for benign and malignant tumors
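    # NOTE: sns.distplot is deprecated in recent seaborn releases; an
    # equivalent with the newer API would be, e.g.,
    # sns.histplot(cancer_b_data['Area (mean)'], label="Benign tumors")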
    sns.distplot(a=cancer_b_data['Area (mean)'],
                 label="Benign tumors",
                 kde=False)
    sns.distplot(a=cancer_m_data['Area (mean)'],
                 label="Malignant tumors",
                 kde=False)
    plt.legend()
    plt.show()

    # ----------------------------
    # Step 4: A very useful column
    # ----------------------------
    # Part A
    # KDE plots for benign and malignant tumors
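    # NOTE: recent seaborn releases replace shade=True with fill=True in
    # sns.kdeplot; the calls below follow the older API from the exercise.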
    sns.kdeplot(data=cancer_b_data['Radius (worst)'],
                label="Benign Tumors",
                shade=True)
    sns.kdeplot(data=cancer_m_data['Radius (worst)'],
                label="Malignant tumors",
                shade=True)

    # Add title
    plt.title(
        "Distribution in values for 'Radius (worst)', for both benign and malignant tumors"
    )

    # Force legend to appear
    plt.legend()

    plt.show()
Example #4
File: tutorials.py Project: raul23/ML-labs
def lesson_6():
    print_("LESSON 6: XGBoost", 0, 1)
    X_train, X_valid, y_train, y_valid = load_data_for_lesson_6()

    my_model = XGBRegressor()
    my_model.fit(X_train, y_train)

    predictions = my_model.predict(X_valid)
    print("Mean Absolute Error: " +
          str(mean_absolute_error(predictions, y_valid)))
Example #5
File: exercises.py Project: raul23/ML-labs
def ex_6():
    print_("Exercise 6: XGBoost", 0, 1)
    X_train, X_valid, y_train, y_valid, X_test = load_data_for_ex_6()
    # -------------------
    # Step 1: Build model
    # -------------------
    # Part A
    # Define the model
    my_model_1 = XGBRegressor(random_state=0)

    # Fit the model
    my_model_1.fit(X_train, y_train)

    # Part B
    # Get predictions
    predictions_1 = my_model_1.predict(X_valid)

    # Part C
    # Calculate MAE
    mae_1 = mean_absolute_error(predictions_1, y_valid)
    print("Mean Absolute Error:", mae_1)

    # -------------------------
    # Step 2: Improve the model
    # -------------------------
    my_model_2 = XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.05, n_jobs=4)

    # Fit the model
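    # NOTE: newer XGBoost releases deprecate passing early_stopping_rounds to
    # fit() in favor of setting it in the XGBRegressor constructor; the call
    # below follows the older API used in the original exercise.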
    my_model_2.fit(X_train, y_train,
                   early_stopping_rounds=5,
                   eval_set=[(X_valid, y_valid)],
                   verbose=False)

    # Get predictions
    predictions_2 = my_model_2.predict(X_valid)

    # Calculate MAE
    mae_2 = mean_absolute_error(predictions_2, y_valid)
    print("Mean Absolute Error:" , mae_2)

    # -----------------------
    # Step 3: Break the model
    # -----------------------
    my_model_3 = XGBRegressor(random_state=0, n_estimators=10, learning_rate=0.9)

    # Fit the model
    my_model_3.fit(X_train, y_train)

    # Get predictions
    predictions_3 = my_model_3.predict(X_valid)

    # Calculate MAE
    mae_3 = mean_absolute_error(predictions_3, y_valid)
    print("Mean Absolute Error:", mae_3)
Example #6
File: tutorials.py Project: raul23/ML-labs
def lesson_5():
    print_("LESSON 5: Cross-Validation", 0, 1)
    X, y = load_data_for_lesson_5()

    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ])

    # Multiply by -1 since sklearn calculates *negative* MAE
    scores = -1 * cross_val_score(
        my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')

    print("MAE scores:\n", scores)

    print("\nAverage MAE score (across experiments):")
    print(scores.mean(), end="\n\n")
Example #7
def ex_1():
    print_("Exercise 1: Hello, Seaborn", 0, 1)

    # ---------------------
    # Step 2: Load the data
    # ---------------------
    fifa_data = pd.read_csv(fifa_filepath, index_col="Date", parse_dates=True)

    # ---------------------
    # Step 3: Plot the data
    # ---------------------
    # Set the width and height of the figure
    plt.figure(figsize=(16, 6))

    # Line chart showing how FIFA rankings evolved over time
    sns.lineplot(data=fifa_data)

    plt.show()
Example #8
File: tutorials.py Project: raul23/ML-labs
def load_data_from_ex_7():
    # Read the data
    data = pd.read_csv(credit_card_file_path,
                       true_values=['yes'],
                       false_values=['no'])

    # Select target
    y = data.card

    # Select predictors
    X = data.drop(['card'], axis=1)

    print("Number of rows in the dataset:", X.shape[0])
    print()
    print_("First 5 rows from X", 0)
    print_(X.head())

    return X, y
Example #9
def ex_6():
    print_("Exercise 6: Choosing Plot Types and Custom Styles", 0, 1)

    spotify_data = pd.read_csv(spotify_filepath,
                               index_col="Date",
                               parse_dates=True)

    # ----------------------
    # Try out seaborn styles
    # ----------------------
    # Change the style of the figure
    sns.set_style("white")

    # Line chart
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=spotify_data)

    plt.show()
Example #10
File: tutorials.py Project: raul23/ML-labs
def lesson_6():
    print_("Lesson 6: Choosing Plot Types and Custom Styles", 0, 1)

    spotify_data = pd.read_csv(spotify_filepath, index_col="Date", parse_dates=True)

    # Line chart
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=spotify_data)
    plt.show()

    # Seaborn has five different themes: (1)"darkgrid", (2)"whitegrid",
    # (3)"dark", (4)"white", and (5)"ticks"

    # Change the style of the figure to the "dark" theme
    sns.set_style("dark")

    # Line chart
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=spotify_data)
    plt.show()
Example #11
File: tutorials.py Project: raul23/ML-labs
def lesson_4():
    print_("LESSON 4: Pipelines", 0, 1)
    # -------
    # Example
    # -------
    (X_train, X_valid, y_train, y_valid,
     numerical_cols, categorical_cols) = load_data_for_lesson_4()

    print_("First 5 rows from the train data", 0)
    print_(X_train.head())

    # Build pipeline in 3 steps

    # ----------------------------------
    # Step 1: Define Preprocessing Steps
    # ----------------------------------
    # The code below:
    #
    # - imputes missing values in numerical data, and
    # - imputes missing values and applies a one-hot encoding to categorical data.

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='constant')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # ------------------------
    # Step 2: Define the Model
    # ------------------------
    model = RandomForestRegressor(n_estimators=100, random_state=0)

    # ----------------------------------------
    # Step 3: Create and Evaluate the Pipeline
    # ----------------------------------------
    # Define a pipeline that bundles the preprocessing and modeling steps

    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    # Preprocessing of training data, fit model
    my_pipeline.fit(X_train, y_train)

    # Preprocessing of validation data, get predictions
    preds = my_pipeline.predict(X_valid)

    # Evaluate the model
    score = mean_absolute_error(y_valid, preds)
    print('MAE:', score)
Example #12
File: exercises.py Project: raul23/ML-labs
def ex_1():
    print_("Exercise 1: Introduction", 0, 1)
    # Read the data
    X_full = pd.read_csv(train_file_path, index_col='Id')
    X_test_full = pd.read_csv(test_file_path, index_col='Id')

    # Obtain target and predictors
    y = X_full.SalePrice
    features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
    X = X_full[features].copy()
    X_test = X_test_full[features].copy()

    # Break off validation set from training data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                          random_state=0)

    print_("First 5 rows from the train dataset", 0)
    print_(X_train.head())

    # -------------------------------
    # Step 1: Evaluate several models
    # -------------------------------
    # Define five different random forest models
    model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
    model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
    model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
    model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
    model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)
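    # NOTE: newer scikit-learn versions (1.0+) spell the criterion
    # 'absolute_error' instead of 'mae'; model_3 (and my_model below) keep the
    # older name from the original exercise.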

    models = [model_1, model_2, model_3, model_4, model_5]

    # Function for comparing different models
    def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
        model.fit(X_t, y_t)
        preds = model.predict(X_v)
        return mean_absolute_error(y_v, preds)

    for i in range(0, len(models)):
        mae = score_model(models[i])
        print("Model %d MAE: %d" % (i + 1, mae))

    # Fill in the best model
    best_model = model_3

    # ---------------------------------
    # Step 2: Generate test predictions
    # ---------------------------------
    # Create a Random Forest model
    my_model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
    # Fit the model to the training data
    my_model.fit(X, y)

    # Generate test predictions
    preds_test = my_model.predict(X_test)

    # Save predictions in format used for competition scoring
    output = pd.DataFrame({'Id': X_test.index,
                           'SalePrice': preds_test})
    output.to_csv('ex1_submission.csv', index=False)
Example #13
File: tutorials.py Project: raul23/ML-labs
def lesson_3():
    print_("Lesson 3: Bar Charts and Heatmaps", 0, 1)

    # -------------
    # Load the data
    # -------------
    flight_data = pd.read_csv(flight_filepath, index_col="Month")

    # ----------------
    # Examine the data
    # ----------------
    print_("The whole data", 0)
    print_(flight_data)

    # ---------
    # Bar chart
    # ---------
    # Set the width and height of the figure
    plt.figure(figsize=(10, 6))

    # Add title
    plt.title("Average Arrival Delay for Spirit Airlines Flights, by Month")

    # Bar chart showing average arrival delay for Spirit Airlines flights by month
    sns.barplot(x=flight_data.index, y=flight_data['NK'])

    # Add label for vertical axis
    plt.ylabel("Arrival delay (in minutes)")

    plt.show()

    # Important: You must select the indexing column with flight_data.index,
    # and it is not possible to use flight_data['Month'] (which will return an
    # error). This is because when we loaded the dataset, the "Month" column
    # was used to index the rows. We always have to use this special notation
    # to select the indexing column.
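    # For example (hypothetical call, not executed), the following would raise
    # a KeyError because "Month" is the row index rather than a regular column:
    #   sns.barplot(x=flight_data['Month'], y=flight_data['NK'])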

    # -------
    # Heatmap
    # -------
    # Set the width and height of the figure
    plt.figure(figsize=(14, 7))

    # Add title
    plt.title("Average Arrival Delay for Each Airline, by Month")

    # Heatmap showing average arrival delay for each airline by month
    # NOTE: annot=True - This ensures that the values for each cell appear on
    # the chart. (Leaving this out removes the numbers from each of the cells!)
    sns.heatmap(data=flight_data, annot=True)

    # Add label for horizontal axis
    plt.xlabel("Airline")

    plt.show()
Example #14
File: bonus.py Project: raul23/ML-labs
def titanic():
    # Load Titanic train dataset
    train_data = pd.read_csv(titanic_train_file_path)
    print_("First 5 rows from Titanic train dataset", 0)
    print_(train_data.head())

    # Load test set
    test_data = pd.read_csv(titanic_test_file_path)
    print_("First 5 rows from Titanic test dataset", 0)
    print_(test_data.head())

    # Part 3: Improve your score
    # Explore a pattern: assume that all female passengers survived (and all
    # male passengers died)
    women = train_data.loc[train_data.Sex == 'female']["Survived"]
    rate_women = sum(women) / len(women)

    # Based on the train set
    print("% of women who survived:", rate_women)

    men = train_data.loc[train_data.Sex == 'male']["Survived"]
    rate_men = sum(men) / len(men)

    # Based on the train set
    print("% of men who survived:", rate_men)

    # Your first machine learning model: a random forest model
    y = train_data["Survived"]

    features = ["Pclass", "Sex", "SibSp", "Parch"]
    X = pd.get_dummies(train_data[features])
    X_test = pd.get_dummies(test_data[features])

    model = RandomForestClassifier(n_estimators=100,
                                   max_depth=5,
                                   random_state=1)
    model.fit(X, y)
    predictions = model.predict(X_test)

    output = pd.DataFrame({
        'PassengerId': test_data.PassengerId,
        'Survived': predictions
    })
    output.to_csv('my_submission.csv', index=False)
    print("Your submission was successfully saved!")
Example #15
def lesson_4():
    # Load data
    melbourne_data = pd.read_csv(melbourne_file_path)
    # Filter rows with missing price values
    filtered_melbourne_data = melbourne_data.dropna(axis=0)
    # Choose target and features
    y = filtered_melbourne_data.Price
    melbourne_features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
        'Lattitude', 'Longtitude'
    ]
    X = filtered_melbourne_data[melbourne_features]

    # Define model
    melbourne_model = DecisionTreeRegressor()
    # Fit model
    melbourne_model.fit(X, y)

    # Calculate the mean absolute error
    predicted_home_prices = melbourne_model.predict(X)
    mae = mean_absolute_error(y, predicted_home_prices)
    print_("Mean absolute error when using just train set", 0)
    print_(mae)

    # Split data into training and validation data, for both features and target
    # The split is based on a random number generator. Supplying a numeric value to
    # the random_state argument guarantees we get the same split every time we
    # run this script.
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
    # Define model
    melbourne_model = DecisionTreeRegressor()
    # Fit model
    melbourne_model.fit(train_X, train_y)

    # Get predicted prices on validation data
    val_predictions = melbourne_model.predict(val_X)
    print_("Mean absolute error when using train and validation sets", 0)
    print_(mean_absolute_error(val_y, val_predictions))
Example #16
File: exercises.py Project: raul23/ML-labs
def ex_5():
    print_("Exercise 5: Cross-validation", 0, 1)
    X, y, X_test = load_data_for_ex_5()

    print_("First 5 rows from X", 0)
    print_(X.head())

    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ])

    # Multiply by -1 since sklearn calculates *negative* MAE
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=5,
                                  scoring='neg_mean_absolute_error')

    print("Average MAE score:", scores.mean())

    # -------------------------------
    # Step 1: Write a useful function
    # -------------------------------
    def get_score(n_estimators):
        """Return the average MAE over 3 CV folds of random forest model.

        Keyword argument:
        n_estimators -- the number of trees in the forest
        """
        my_pipeline_ = Pipeline(steps=[
            ('preprocessor', SimpleImputer()),
            ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))
        ])
        # Multiply by -1 since sklearn calculates *negative* MAE
        scores = -1 * cross_val_score(my_pipeline_, X, y,
                                      cv=3,
                                      scoring='neg_mean_absolute_error')
        # print("\nAverage MAE score (across experiments):")
        # print(scores.mean(), end="\n\n")
        return scores.mean()

    # ---------------------------------------
    # Step 2: Test different parameter values
    # ---------------------------------------
    results = dict([(i, get_score(i)) for i in range(50, 300, 50)])

    plt.plot(list(results.keys()), list(results.values()))
    plt.show()
Example #17
File: tutorials.py Project: raul23/ML-labs
def lesson_4():
    print_("Lesson 4: Scatter Plots", 0, 1)

    # -------------------------
    # Load and examine the data
    # -------------------------
    insurance_data = pd.read_csv(insurance_filepath)
    print_("First 5 rows", 0)
    print_(insurance_data.head())

    # -------------
    # Scatter plots
    # -------------
    sns.scatterplot(x=insurance_data['bmi'], y=insurance_data['charges'])
    plt.show()

    # Draw a line that best fits the data
    sns.regplot(x=insurance_data['bmi'], y=insurance_data['charges'])
    plt.show()

    # -------------------------
    # Color-coded scatter plots
    # -------------------------
    # Use color-coded scatter plots to display the relationships between 3
    # variables
    sns.scatterplot(x=insurance_data['bmi'], y=insurance_data['charges'], hue=insurance_data['smoker'])
    plt.show()

    # sns.lmplot: adds two regression lines
    sns.lmplot(x="bmi", y="charges", hue="smoker", data=insurance_data)
    plt.show()

    # ------------------------
    # Categorical scatter plot
    # ------------------------
    # We use this sort of scatter plot to highlight the relationship between a
    # continuous and categorical variables
    sns.swarmplot(x=insurance_data['smoker'],
                  y=insurance_data['charges'])
    plt.show()
Example #18
File: tutorials.py Project: raul23/ML-labs
def lesson_1():
    print_("Lesson 1: Hello, Seaborn", 0, 1)
    # -------------
    # Load the data
    # -------------
    fifa_data = pd.read_csv(fifa_filepath, index_col="Date", parse_dates=True)

    # ----------------
    # Examine the data
    # ----------------
    print_("The first 5 rows of the data", 0)
    print_(fifa_data.head())

    # -------------
    # Plot the data
    # -------------
    # Set the width and height of the figure
    plt.figure(figsize=(16, 6))

    # Line chart showing how FIFA rankings evolved over time
    sns.lineplot(data=fifa_data)

    plt.show()
Example #19
def lessons_1_to_3():
    # Load Melbourne Housing Snapshot dataset
    melbourne_data = pd.read_csv(melbourne_file_path)
    # Print a summary of the data in Melbourne data
    print_("Summary of dataset", 0)
    print_(melbourne_data.describe())

    # List of all columns in the dataset
    print_("Columns", 0)
    print_(melbourne_data.columns)

    # Drop missing values
    melbourne_data = melbourne_data.dropna(axis=0)

    # Select the prediction target (price)
    y = melbourne_data.Price
    print_("y", 0)
    print_(y)

    # Select features
    melbourne_features = [
        'Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude'
    ]
    X = melbourne_data[melbourne_features]
    print_("X", 0)
    print_(X)

    print_("Summary of X", 0)
    print_(X.describe())
    print_("First few rows of X", 0)
    print_(X.head())

    # Define model. Specify a number for random_state to ensure same results each run
    melbourne_model = DecisionTreeRegressor(random_state=1)

    # Fit model
    melbourne_model.fit(X, y)

    print_("Making predictions for the following 5 houses:", 0)
    print_(X.head())
    print_("The predictions are", 0)
    print_(melbourne_model.predict(X.head()))
Example #20
def lesson_2():
    print_("Lesson 2: Categorical Encodings", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    data = ks[data_cols].join(encoded)

    # Define functions that will help us test our encodings
    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_fraction = 0.1
        valid_size = int(len(dataframe) * valid_fraction)

        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]

        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
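        # NOTE: recent LightGBM releases (4.x) drop the early_stopping_rounds
        # and verbose_eval arguments from lgb.train(); the rough equivalent is
        # callbacks=[lgb.early_stopping(10)] (with lgb.log_evaluation
        # controlling the logging).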
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")

    # Train a model (on the baseline data)
    train, valid, test = get_data_splits(data)
    print_("Baseline (LightGBM with no categorical encoding)", 0)
    train_model(train, valid)
    print()

    # --------------
    # Count Encoding
    # --------------
    cat_features = ['category', 'currency', 'country']

    # Create the encoder
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix, and join to dataframe
    # TODO: should the counts be calculated on the train set only (as was done
    # in Exercise 2) to avoid data leakage, rather than on the whole dataset?
    count_encoded = count_enc.fit_transform(ks[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))
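    # A leak-free variant (sketch, following what was done in Exercise 2) would
    # fit the encoder on the train split only and transform each split
    # separately, e.g.:
    #   train, valid, test = get_data_splits(data)
    #   count_enc = ce.CountEncoder(cols=cat_features)
    #   count_enc.fit(train[cat_features])
    #   train = train.join(count_enc.transform(train[cat_features]).add_suffix('_count'))
    #   valid = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_count'))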

    # Train a model
    train, valid, test = get_data_splits(data)
    print_("LightGBM with COUNT encoding", 0)
    train_model(train, valid)
    print()

    # ---------------
    # Target Encoding
    # ---------------
    # Create the encoder
    target_enc = ce.TargetEncoder(cols=cat_features)
    target_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _target suffix, and join to dataframe
    train_TE = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_TE = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    train_model(train_TE, valid_TE)
    print()

    # -----------------
    # CatBoost Encoding
    # -----------------
    # Create the encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features)
    cb_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _cb suffix, and join to dataframe
    train_CBE = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_CBE = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    train_model(train_CBE, valid_CBE)
    print()
Example #21
def lesson_4():
    print_("Lesson 4: Feature Selection", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    cat_features = ['category', 'currency', 'country']
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)

    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    timedeltas = timedeltas.fillna(timedeltas.max())

    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_fraction = 0.1
        valid_size = int(len(dataframe) * valid_fraction)

        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]

        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7
        }
        print("Training model!")
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the whole
    # dataset as we are doing here (this will be fixed next)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    feature_cols = baseline_data.columns.drop('outcome')
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)

    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only (others set at 0)",
        0)
    print_(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    selected_columns = selected_features.columns[selected_features.var() != 0]

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    train, valid, _ = get_data_splits(baseline_data)

    X, y = train[train.columns.drop("outcome")], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1,
                                  penalty="l1",
                                  solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected_columns = selected_features.columns[selected_features.var() != 0]
    print_("Rejected columns: {}".format(
        selected_features.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())
Example #22
def lesson_1():
    print_("Lesson 1: Baseline Model", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])
    print_("First 6 rows from the Kickstarter Projects dataset", 0)
    print_(ks.head(6))

    print('Unique values in `state` column:', list(ks.state.unique()))

    # Prepare the target column
    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Convert timestamps
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Prep categorical variables
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()

    # Apply the label encoder to each column
    encoded = ks[cat_features].apply(encoder.fit_transform)

    # Collect all of these features in a new dataframe that we can use to train
    # a model
    #
    # Since ks and encoded have the same index, we can easily join them
    data = ks[['goal', 'hour', 'day', 'month', 'year',
               'outcome']].join(encoded)
    data.head()

    # Create training, validation, and test splits
    # Use 10% of the data as a validation set, 10% for testing, and the other
    # 80% for training.
    valid_fraction = 0.1
    valid_size = int(len(data) * valid_fraction)

    train = data[:-2 * valid_size]
    valid = data[-2 * valid_size:-valid_size]
    test = data[-valid_size:]

    # Train a model
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary'}
    param['metric'] = 'auc'
    num_round = 1000
    bst = lgb.train(param,
                    dtrain,
                    num_round,
                    valid_sets=[dvalid],
                    early_stopping_rounds=10,
                    verbose_eval=False)

    # Make predictions & evaluate the model
    ypred = bst.predict(test[feature_cols])
    score = metrics.roc_auc_score(test['outcome'], ypred)

    print(f"Test AUC score: {score}")
Example #23
def lesson_3():
    print_("Lesson 3: Feature Generation", 0, 1)
    # -----
    # Setup
    # -----
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    # ------------
    # Interactions
    # ------------
    interactions = ks['category'] + "_" + ks['country']
    print_("Interactions: first 5 rows from category_country", 0)
    print_(interactions.head(5))

    # Label encode the interaction feature and add it to the data
    label_enc = LabelEncoder()
    data_interaction = baseline_data.assign(
        category_country=label_enc.fit_transform(interactions))
    print_("First 5 rows from data with the added interactions", 0)
    print_(data_interaction.head())

    # -----------------------------------
    # Number of projects in the last week
    # -----------------------------------
    # First, create a Series with a timestamp index
    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    print_("First 20 rows from series with the timestamp index", 0)
    print_(launched.head(20))

    count_7_days = launched.rolling('7d').count() - 1
    print_("First 20 rows from the rolling window of 7 days", 0)
    print_(count_7_days.head(20))

    # Ignore records with broken launch dates
    plt.plot(count_7_days[7:])
    plt.title("Number of projects launched over periods of 7 days")
    plt.show()

    # Adjust the index so we can join it with the other training data.
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)

    print_(
        "First 10 rows from the rolling window of 7 days (with index adjusted)",
        0)
    print_(count_7_days.head(10))

    # Now join the new feature with the other data again using .join since
    # we've matched the index.
    print_(
        "First 10 rows from baseline data with the new feature (count_7_days)",
        0)
    print_(baseline_data.join(count_7_days).head(10))

    # ------------------------------------------------
    # Time since the last project in the same category
    # ------------------------------------------------
    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    print_(
        "First 20 rows from timedeltas (time since the last project in "
        "the same category)", 0)
    print_(timedeltas.head(20))
    # We get NaNs here for projects that are the first in their category.

    # Fix NaNs by using the mean or median. We'll also need to reset the index
    # so we can join it with the other data.
    # Final time since last project
    timedeltas = timedeltas.fillna(timedeltas.median()).reindex(
        baseline_data.index)
    print_("First 20 rows from timedeltas (with NaNs fixed)", 0)
    print_(timedeltas.head(20))

    # -------------------------------
    # Transforming numerical features
    # -------------------------------
    # Some models work better when the features are normally distributed
    # Transform them with the square root or natural logarithm.

    # Example: transform the goal feature using the square root and log functions

    # Square root transformation
    plt.hist(np.sqrt(ks.goal), range=(0, 400), bins=50)
    plt.title('Sqrt(Goal)')
    plt.show()

    # Log function transformation
    plt.hist(np.log(ks.goal), range=(0, 25), bins=50)
    plt.title('Log(Goal)')
    plt.show()
Example #24
def lesson_2():
    print_("Lesson 2: Indexing, Selecting & Assigning", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ----------------
    # Native accessors
    # ----------------
    print_("Country column from reviews", 0)
    print_(reviews.country)  # also reviews['country']

    print_("First country from the country Series", 0)
    print_(reviews.country[0])

    # ------------------
    # Indexing in pandas
    # ------------------
    # pandas' own accessor operators: loc and iloc
    #
    # NOTE: loc and iloc are row-first, column-second
    # This is the opposite of what we do in native Python, which is
    # column-first, row-second.

    # Index-based selection: iloc
    # NOTE 1: iloc requires numeric indexers,
    # NOTE 2: iloc indexes exclusively
    # Select the first row of data in a DataFrame
    print_("First row of data", 0)
    print_(reviews.iloc[0])

    print_("Get the first column from a DataFrame", 0)
    print_(reviews.iloc[:, 0])

    print_("Get the first 3 rows from the country column", 0)
    print_(reviews.iloc[:3, 0])

    print_("Get the 2nd and 3rd rows from the country column", 0)
    print_(reviews.iloc[1:3, 0])

    print_("Get the first 3 rows from the country column using a list", 0)
    print_(reviews.iloc[[0, 1, 2], 0])

    print_("Get the 5 last elements from the dataset", 0)
    print_(reviews.iloc[-5:])

    # Label-based selection: loc
    # NOTE 1: loc works with string indexers,
    # NOTE 2: loc, meanwhile, indexes inclusively
    print_("Get the first entry in reviews (using loc)", 0)
    print_(reviews.loc[0, 'country'])

    print_("Get columns from the dataset using loc", 0)
    print_(reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']])
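    # Illustration of the exclusive/inclusive difference (with the default
    # integer index): iloc[0:3] returns 3 rows, loc[0:3] returns 4, because
    # loc includes the end label.
    print_("Shape of reviews.iloc[0:3] (end excluded)", 0)
    print_(reviews.iloc[0:3].shape)
    print_("Shape of reviews.loc[0:3] (end included)", 0)
    print_(reviews.loc[0:3].shape)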

    # ----------------------
    # Manipulating the index
    # ----------------------
    print_("set_index to the title field", 0)
    print_(reviews.set_index("title"))

    # ---------------------
    # Conditional selection
    # ---------------------
    print_("Check if each wine is Italian or not", 0)
    print_(reviews.country == 'Italy')

    print_("Get italian wined", 0)
    print_(reviews.loc[reviews.country == 'Italy'])

    # AND: &
    print_("Get italian wines that are better than average", 0)
    print_(reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)])

    # OR : | (pipe)
    print_("Get italian or better than average wines", 0)
    print_(reviews.loc[(reviews.country == 'Italy') | (reviews.points >= 90)])

    # isin conditional selector
    print_("Get wines from Italy or France", 0)
    print_(reviews.loc[reviews.country.isin(['Italy', 'France'])])

    # isnull and notnull: is (or not) empty (NaN)
    print_("Get wines with a price tag", 0)
    print_(reviews.loc[reviews.price.notnull()])

    # --------------
    # Assigning data
    # --------------
    # Assign a constant value
    # Every row gets 'everyone'
    reviews['critic'] = 'everyone'
    print_("Assign a constant value", 0)
    print_(reviews['critic'])

    # Assign an iterable of values
    reviews['index_backwards'] = range(len(reviews), 0, -1)
    print_("Assign an iterable of values", 0)
    print_(reviews['index_backwards'])
Example #25
def lesson_6():
    # pd.set_option('max_rows', 5)
    print_("Lesson 6: Renaming and Combining", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # --------
    # Renaming
    # --------
    # rename(): lets you change index names and/or column names

    # Change column
    # Change the points column in our dataset to score
    print_("Change the points column to score", 0)
    print_(reviews.rename(columns={'points': 'score'}))

    # Change indexes
    print_("Rename some elements of the index", 0)
    print_(reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'}))

    # IMPORTANT: set_index() is usually more convenient than using rename()
    # to change indexes

    # rename_axis(): change the names for the row index and the column index
    print_("Change the row index to wines and the column index to fields", 0)
    print_(
        reviews.rename_axis("wines", axis='rows').rename_axis("fields",
                                                              axis='columns'))

    # ---------
    # Combining
    # ---------
    # Three core methods for combining DataFrames and Series (start less complex)
    # - concat()
    # - join()
    # - merge()
    #
    # NOTE: what merge() can do, join() can usually do more simply (a merge()
    # sketch is included at the end of this function)

    # concat(): smush a given list of elements together along an axis
    #
    # Smush two datasets
    # Ref.: https://www.kaggle.com/datasnaek/youtube-new
    canadian_youtube = pd.read_csv(
        os.path.expanduser(
            "~/Data/kaggle_datasets/trending_youtube/CAvideos.csv"))
    british_youtube = pd.read_csv(
        os.path.expanduser(
            "~/Data/kaggle_datasets/trending_youtube/GBvideos.csv"))

    print_("Concat two datasets", 0)
    print_(pd.concat([canadian_youtube, british_youtube]))

    # join(): lets you combine different DataFrame objects which have an index
    # in common
    #
    # Pull down videos that happened to be trending on the same day in both
    # Canada and the UK
    print_(
        "videos that happened to be trending on the same day in both Canada "
        "and the UK", 0)
    left = canadian_youtube.set_index(['title', 'trending_date'])
    right = british_youtube.set_index(['title', 'trending_date'])

    print_(left.join(right, lsuffix='_CAN', rsuffix='_UK'))
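    # A rough merge() equivalent of the join above (sketch, not executed here;
    # note that merge() defaults to an inner join while join() defaults to a
    # left join):
    #   canadian_youtube.merge(british_youtube,
    #                          on=['title', 'trending_date'],
    #                          suffixes=('_CAN', '_UK'))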
Example #26
def lesson_5():
    # pd.set_option('max_rows', 5)
    print_("Lesson 5: Data Types and Missing Values", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ------
    # DTypes
    # ------
    # column.dtype
    print_("dtype of the price column", 0)
    print_(reviews.price.dtype)

    # DataFrame.dtypes: dtypes of every column
    print_("dtypes of every column", 0)
    print_(reviews.dtypes)

    # object type: for strings
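    # e.g. the country column holds strings, so its dtype is object
    print_("dtype of the country column (object, i.e. strings)", 0)
    print_(reviews.country.dtype)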

    # astype(): converts a column of one type into another
    print_("Convert points from int64 t float64", 0)
    print_(reviews.points.astype('float64'))

    # ------------
    # Missing data
    # ------------
    # NaN values are always of the float64 dtype

    # Select NaN entries
    print_("Select NaN entries for country", 0)
    print_(reviews[pd.isnull(reviews.country)])

    # Replace missing values with fillna()
    print_("Replace missing values with Unknown", 0)
    print_(reviews.region_2.fillna("Unknown"))

    # Backfill strategy for filling missing values: fill each missing value
    # with the first non-null value that appears sometime after the given
    # record in the database.
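    # e.g. (sketch): reviews.region_2.bfill() fills each NaN with the next
    # non-null value below it; .ffill() is the forward-fill counterpart.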

    # Replace a non-null value: replace()
    print_("Replace @kerinokeefe to @kerino", 0)
    print_(reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino"))
Example #27
def lesson_4():
    pd.set_option("display.max_rows", 5)
    print_("Lesson 4: Grouping and Sorting", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ------------------
    # Groupwise analysis
    # ------------------
    print_("Count occurrences of each point using group_by()", 0)
    print_(reviews.groupby('points').points.count())

    # Equivalent to using value_counts()
    print_("Count occurrences of each point using value_counts()", 0)
    print_(reviews.points.value_counts().sort_index())

    # Get the cheapest wine in each point value category
    print_("Cheapest wine in each point value category", 0)
    print_(reviews.groupby('points').price.min())

    # Select the name of the first wine reviewed from each winery
    print_(
        "Select the name of the first wine reviewed from each winery using apply()",
        0)
    print_(reviews.groupby('winery').apply(lambda df: df.title.iloc[0]))

    # You can also group by more than one column
    # Example: pick out the best wine by country and province:
    print_("Pick out the best wine by country and province", 0)
    print_(reviews.groupby(['country', 'province'])
           .apply(lambda df: df.loc[df.points.idxmax()]))

    # agg(): lets you run a bunch of different functions on your DataFrame simultaneously
    # Example: generate a simple statistical summary of the dataset by country
    print_("Statistical summary by country", 0)
    print_(reviews.groupby(['country']).price.agg([len, min, max]))

    # -------------
    # Multi-indexes
    # -------------
    # vs single-level (regular) indices
    # More info about multi-indexes at https://pandas.pydata.org/pandas-docs/stable/advanced.html
    countries_reviewed = reviews.groupby(['country',
                                          'province']).description.agg([len])
    print_("Multi-index: country and province", 0)
    print_(countries_reviewed)

    # reset_index(): important multi-index method that converts back to a
    # regular index
    print_("reset_index(): get back to the original single index", 0)
    print_(countries_reviewed.reset_index())

    # -------
    # Sorting
    # -------
    countries_reviewed = countries_reviewed.reset_index()
    print_("Sort by 'len' (ascending)", 0)
    print_(countries_reviewed.sort_values(by='len'))

    print_("Sort by 'len' (descending)", 0)
    print_(countries_reviewed.sort_values(by='len', ascending=False))

    # Sort by index values
    print_("Sort by index values", 0)
    print_(countries_reviewed.sort_index())

    # Sort by more than one column at a time
    print_("Sort by 2 columns: country and len", 0)
    print_(countries_reviewed.sort_values(by=['country', 'len']))
Example #28
def lesson_1():
    print_("Lesson 1: Creating, Reading and Writing", 0, 1)
    # -------------
    # Creating data
    # -------------
    # DataFrame
    dt_int = pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})
    print_("Simple DataFrame with integers", 0)
    print_(dt_int)

    dt_str = pd.DataFrame({
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good.', 'Bland.']
    })
    print_("Simple DataFrame with strings", 0)
    print_(dt_str)

    dt_index = pd.DataFrame(
        {
            'Bob': ['I liked it.', 'It was awful.'],
            'Sue': ['Pretty good.', 'Bland.']
        },
        index=['Product A', 'Product B'])
    print_("DataFrame with row labels", 0)
    print_(dt_index)

    # Series
    s_list = pd.Series([1, 2, 3, 4, 5])
    print_("Simple series with integers", 0)
    print_(s_list)

    # NOTE: a Series does not have a column name, it only has one overall name
    s_index_name = pd.Series([30, 35, 40],
                             index=['2015 Sales', '2016 Sales', '2017 Sales'],
                             name='Product A')
    print_("Series with row labels and a name", 0)
    print_(s_index_name)

    # ---------------
    # Read data files
    # ---------------
    wine_reviews = pd.read_csv(wine_file_path)
    print_("How large the Wine Reviews dataset is", 0)
    print("Shape: ", wine_reviews.shape)
    print("Number of entries: ", wine_reviews.shape[0] * wine_reviews.shape[1])
    print()

    print_("First 5 rows from the Wine Reviews dataset", 0)
    print_(wine_reviews.head())

    # Make pandas use the CSV's built-in index for the index (instead of
    # creating a new one from scratch) by specifying an index_col
    wine_reviews = pd.read_csv(wine_file_path, index_col=0)
    print_("First 5 rows from the Wine Reviews dataset [using index_col=0]")
    print_(wine_reviews.head())
Example #29
def lesson_3():
    print_("Lesson 3: Summary Functions and Maps", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)
    print_("Reviews", 0)
    print_(reviews)

    # -----------------
    # Summary functions
    # -----------------
    # Describe with numerical data
    print_("Describe reviews.points (numerical data only)", 0)
    print_(reviews.points.describe())

    # Describe with string data
    print_("Describe reviews.taster_name (string data)", 0)
    print_(reviews.taster_name.describe())

    # Statistic: mean
    print_("Mean of reviews.points", 0)
    print_(reviews.points.mean())

    # Unique values
    print_("Unique values from reviews.taster_name", 0)
    print_(reviews.taster_name.unique())

    # Unique values and how often they occur in the dataset
    print_("Unique values and their counts from reviews.taster_name", 0)
    print_(reviews.taster_name.value_counts())

    # ----
    # Maps
    # ----
    # Two important mapping methods: map() and apply()
    # NOTE: they don't modify the original data they're called on

    # map()
    # Remean the scores the wines received to 0
    review_points_mean = reviews.points.mean()
    remeans = reviews.points.map(lambda p: p - review_points_mean)
    print_("Remean the wine scores to 0 using map()", 0)
    print_(remeans)

    # apply()
    # NOTE: apply() is way slower than map()
    def remean_points(row):
        row.points = row.points - review_points_mean
        return row

    # NOTE: if axis='index', we transform each column
    # Commented because too slow
    """
    reviews_remeans = reviews.apply(remean_points, axis='columns')
    print_("Remean the wine scores to 0 using apply()", 0)
    print_(reviews_remeans.points)
    """

    # Faster way to remean the points column
    review_points_mean = reviews.points.mean()
    remeans = reviews.points - review_points_mean
    print_("Remean the wine scores to 0 using .mean() [Faster]", 0)
    print_(remeans)

    # Combining columns
    comb_cols = reviews.country + " - " + reviews.region_1
    print_("Combining country and region info", 0)
    print_(comb_cols)
Example #30
def ex_2():
    print_("Exercise 2: Line Charts", 0, 1)

    # ---------------------
    # Step 1: Load the data
    # ---------------------
    museum_data = pd.read_csv(museum_filepath,
                              index_col="Date",
                              parse_dates=True)

    # -----------------------
    # Step 2: Review the data
    # -----------------------
    # Print the last five rows of the data
    print_("Last 5 rows", 0)
    print_(museum_data.tail())

    # How many visitors did the Chinese American Museum receive in July 2018?
    ca_museum_jul18 = museum_data.loc['2018-07-01', 'Chinese American Museum']
    print_(
        "Number of visitor the Chinese American Museum receive in July 2018",
        0)
    print_(ca_museum_jul18)

    # In October 2018, how many more visitors did Avila
    # Adobe receive than the Firehouse Museum?
    subset = museum_data.loc['2018-10-01', ['Avila Adobe', 'Firehouse Museum']]
    avila_oct18 = subset[0] - subset[1]
    print_(
        "Number of visitors Avila Adobe received more than the Firehouse Museum (October 2018)",
        0)
    print_(avila_oct18)

    # ---------------------------------
    # Step 3: Convince the museum board
    # ---------------------------------
    # Set the width and height of the figure
    plt.figure(figsize=(14, 6))

    # Add title
    plt.title("Monthly visitors for 4 museums in LA")

    # Line chart showing number of visitors to each museum over time
    sns.lineplot(data=museum_data)

    plt.show()

    # --------------------------
    # Step 4: Assess seasonality
    # --------------------------
    # Part A
    # Line plot showing the number of visitors to Avila Adobe over time
    # Set the width and height of the figure
    plt.figure(figsize=(14, 6))

    # Add title
    plt.title("Monthly visitors to Avila Adobe museum")

    # Line chart showing number of visitors to Avila Adobe over time
    sns.lineplot(data=museum_data['Avila Adobe'], label="Avila Adobe")

    # Add label for horizontal axis
    plt.xlabel("Date")

    plt.show()