示例#1
0
def page():
    """Handle a prediction request.

    Reads ``x`` and ``y`` from the query string, trains a fresh model,
    and returns the prediction as a JSON payload ``{'data': '<n>'}``.
    """
    raw_x = request.args['x']
    raw_y = request.args['y']
    predictor = Model()
    predictor.train()
    prediction = predictor.predict([int(raw_x), int(raw_y)])
    return jsonify({'data': str(prediction)})
示例#2
0
    def __init__(self, user_id):
        """Initialise the bot for a single VK user.

        Stores the user's id, resolves their display name from their VK
        profile page, prepares the recognised command list and loads the
        recommendation model.
        """
        print(f"Создан объект бота для пользователя {user_id}!")

        self._USER_ID = user_id
        # Network call: scrapes the user's first name from their profile.
        self._USERNAME = self._get_user_name_from_vk_id(user_id)

        # Supported chat commands (upper-cased Russian keywords).
        self._COMMANDS = [
            "ПРИВЕТ", "ПОГОДА", "ВРЕМЯ", "ПОКА", "КОНСУЛЬТАЦИЯ", "НАЧАТЬ"
        ]
        self._REGISTED_ID = []

        self.model = Model()
示例#3
0
 def test_model_loading(self):
     """A model loaded from a filesystem source exposes a non-None private
     estimator and a non-None target attribute."""
     path = get_testfile("example_model.pkl")
     source = get_model_source("fs", path)
     loaded = Model(source)
     assert getattr(loaded, '_model', None) is not None
     assert getattr(loaded, 'target', None) is not None
示例#4
0
 def test_s3_to_fs_copy(self):
     """An S3-hosted model file can be copied to the local filesystem,
     opened as a Model, and the local copy deleted afterwards."""
     remote_path = "s3://netlyt.com/example_model.pkl"
     local_path = get_testfile("example_model_tmp.pkl")
     remote = get_model_source("s3", remote_path, True)
     local = get_model_source("fs", local_path)
     # The copy must succeed and the copied file must be loadable.
     assert remote.copy_to(local)
     copied_model = Model(local)
     assert local.delete()
示例#5
0
    def test_simple_linear_dataset_scaled_cv(self):
        """Cross-validated, feature-scaled linear-kernel SVR on the simple
        linear dataset."""
        c_values = [0.001, 0.01, 0.1, 1, 10, 100]
        svr = Model.create_model(model_type=Model.MODEL_TYPE_SVR,
                                 cross_validation=True,
                                 feature_scaling=True,
                                 C_range=c_values,
                                 kernel=Model.KERNEL_LINEAR)
        train_dataset, test_dataset = test_datasets.get_simple_linear_datasets()
        self._test_dataset(svr, train_dataset, test_dataset, 0,
                           title="SVR scaled CV on linear dataset")
示例#6
0
    def test_simple_poly_dataset_scaled_cv(self):
        """Cross-validated, feature-scaled polynomial-kernel SVR on the
        simple polynomial dataset."""
        c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
        svr = Model.create_model(model_type=Model.MODEL_TYPE_SVR,
                                 cross_validation=True,
                                 feature_scaling=True,
                                 C_range=c_values,
                                 kernel=Model.KERNEL_POLYNOMIAL)
        train_dataset, test_dataset = test_datasets.get_simple_polynomial_datasets(n=1000)
        self._test_dataset(svr, train_dataset, test_dataset, 0,
                           title="SVR with polynomial kernel, scaled CV on poly dataset")
示例#7
0
    def test_simple_linear_dataset_scaled(self):
        """Feature-scaled linear-kernel SVR with a fixed C=1 (no
        cross-validation) on the simple linear dataset."""
        svr = Model.create_model(model_type=Model.MODEL_TYPE_SVR,
                                 cross_validation=False,
                                 feature_scaling=True,
                                 C=1,
                                 kernel=Model.KERNEL_LINEAR)
        train_dataset, test_dataset = test_datasets.get_simple_linear_datasets()
        self._test_dataset(svr, train_dataset, test_dataset, 0,
                           title="SVR scaled on linear dataset")
示例#8
0
	def main(self):
		"""Run the Avazu pipeline: load and split the data, optionally run
		feature engineering, select the K best features, then train the
		model and either write predictions or print the log loss.

		Fixes vs. the original: lines that mixed a leading space with tabs
		(a TabError under Python 3) are re-indented with tabs only; None
		comparisons use ``is``/``is not``; the invalid ``\\%`` escape in the
		help text is now a plain percent sign.
		"""
		# Step 1 --> Get and sample data from the original dataset
		print("[AVAZU]\tCreating Data Handler for train and test files...")
		if self.validation_file is None:
			data = DataHandler(self.train_file, self.train_file)
		else:
			data = DataHandler(self.train_file, self.validation_file)

		# you can then get train and test files by doing data._train or data._test
		data.create_train_and_test(self.sampling)
		print("[AVAZU]\tData read and split successfully.")

		# Step 2 --> Feature Engineering
		feat_eng = FeatureEngineering()
		# NOTE(review): ``need_feateng`` is not defined in this method nor
		# read from self -- presumably a module-level flag; confirm it
		# should not be ``self.need_feateng``.
		if need_feateng:
			print("[AVAZU]\tStarting feature engineering operations...")
			#data.transform_data(feat_eng.append_hours,feat_eng.append_counters_uniques)
			#data.transform_data(feat_eng.append_days,feat_eng.)
			print("[AVAZU]\tFinished feature engineering operations.")

		#print("[AVAZU]\tSaving intermediary dataframe...")
		#data.save("feat_eng_")
		#print("[AVAZU]\tFile saved...")

		data._train.drop(['click','id'], axis=1, inplace=True)
		data.drop(['C1','device_conn_type','device_type','banner_pos','C15','C16','C17','C18','C19','C20','C21','device_model','site_category','site_domain','site_id','app_category','app_domain','app_id','hours','device_id','device_ip'])

		print("[AVAZU]\tSelecting K-Best Features...")
		idxs = feat_eng.select_best_k_features(data._train,data._ytrain,5)
		print(idxs)
		print("[AVAZU]\tFeatures selected.")

		# Step 3 --> Train model and test it
		print("Training and testing the model...")
		m = Model(idxs)
		m.train(data)
		m.predict(data)

		if self.validation_file is not None:
			print(type(data._test['id'].ravel()))
			df = pd.DataFrame({'id':data._test['id'].ravel(),'click':m._ypred}, columns=['id', 'click'])
			df.to_csv('output.csv', index=False)
			print("Output in the file: output.csv")
		else:
			print("Log Loss Result: " + str(m.log_loss(data)))

		# NOTE(review): this helper is defined but never called from
		# main(); it probably belongs at module level with the arg parsing.
		def print_help():
			print("python avazu.py <args>\n\n")
			print("------------------------------------------------------------------")
			print("List of arguments:")
			print("\t -t: Specifies a training file to read from a csv file\t[REQUIRED]")
			print("\t -v: Specifies a validation/test file to read from a csv file. If the file is not specified, 30% of the training samples will be used to build the validation dataset (Default: Not specified)")
			print("\t -s: Specifies a sampling ratio to be performed over the training data. If none is specified, it will use the whole dataset. (Default: Entire dataset)")
			print("\t -f: (To be inproved) Specifies if one desires to apply feature engineering over the dataset or not. (Default: No transformation applied)")
			print("\t -h: Used to print this menu")
			print("------------------------------------------------------------------")
示例#9
0
    def test_fs_to_s3_copy_authed(self):
        """A local model file can be uploaded to an authenticated S3
        bucket, re-opened from there as a Model, and removed again."""
        # Kept as in the original test; ``boto3``/``os`` look unused and
        # ``settings`` may be imported for credential side effects --
        # TODO confirm before removing.
        import settings
        import boto3
        import os
        remote_path = "s3://data.ml.netlyt.com/example_model.pkl"
        local_path = get_testfile("example_model.pkl")

        local = get_model_source("fs", local_path)
        remote = get_model_source("s3", remote_path)
        # The upload must succeed and the uploaded copy must be loadable.
        assert local.copy_to(remote)
        copied_model = Model(remote)
        assert remote.delete()
示例#10
0
    def test_simple_poly_dataset_scaled_cv(self):
        """Cross-validated, feature-scaled RBF-kernel SVR on the simple
        polynomial dataset; prints the scaled data statistics first."""
        model = Model.create_model(
            model_type=Model.MODEL_TYPE_SVR,
            cross_validation=True,
            feature_scaling=True,
            C_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            kernel=Model.KERNEL_RBF
        )
        train_dataset, test_dataset = test_datasets.get_simple_polynomial_datasets(n=1000)

        # Transform each dataset once and reuse the arrays instead of
        # calling scaler.transform() twice per dataset.
        scaler = StandardScaler()
        scaler.fit(train_dataset.data)
        train_scaled = scaler.transform(train_dataset.data)
        test_scaled = scaler.transform(test_dataset.data)
        print("Train mean: " + str(train_scaled.mean(axis=0)))
        print("Test mean: " + str(test_scaled.mean(axis=0)))
        print("Train std: " + str(train_scaled.std(axis=0)))
        # Label typo fixed: the original printed "Test str:" here.
        print("Test std: " + str(test_scaled.std(axis=0)))

        self._test_dataset(model, train_dataset, test_dataset, 0, title="SVR with RBF kernel, scaled CV on poly dataset")
示例#11
0
def plot_validation_curve(model_type,
                          train_dataset,
                          feature_scaling,
                          polynomial_degree,
                          kernel,
                          svr_degree,
                          svr_epsilon,
                          svr_gamma,
                          svr_coef0,
                          sparse,
                          score_attr=None,
                          cv=5,
                          alpha=None,
                          C=None,
                          n_jobs=-1,
                          save=False,
                          display=True,
                          filename="validation_curve"):
    """Plot (and/or save) a validation curve over the model's
    regularisation parameter.

    For ridge regression the curve is drawn over ``alpha``; for SVR over
    ``C``. The chosen range must be a non-empty list, otherwise a warning
    is logged and nothing is drawn. Other model types are not supported.

    :param score_attr: scoring name passed to sklearn's
        ``validation_curve`` (also used as the y-axis label prefix).
    :param cv: number of cross-validation folds.
    :param n_jobs: parallelism for the curve computation (-1 = all cores).
    :param save: write the plot to ``<filename>_<timestamp>.png``.
    :param display: show the plot interactively.
    """
    if not save and not display:
        return

    estimator = Model.create_model(model_type,
                                   feature_scaling=feature_scaling,
                                   polynomial_degree=polynomial_degree,
                                   cross_validation=False,
                                   kernel=kernel,
                                   svr_degree=svr_degree,
                                   svr_epsilon=svr_epsilon,
                                   svr_gamma=svr_gamma,
                                   svr_coef0=svr_coef0,
                                   sparse=sparse)

    model_type = model_type.upper()
    if model_type == Model.MODEL_TYPE_RIDREG:
        # isinstance() is the idiomatic type check (was ``type(x) != list``).
        if alpha is None or not isinstance(alpha, list) or len(alpha) == 0:
            logging.warning(
                "Validation curve cannot be drawn for %s when no alpha range is specified."
                % model_type)
            return
        param_name = "RIDGE_REGRESSION__alpha"
        param_range = sorted(alpha)
    elif model_type == Model.MODEL_TYPE_SVR:
        if C is None or not isinstance(C, list) or len(C) == 0:
            logging.warning(
                "Validation curve cannot be drawn for %s when no C range is specified."
                % model_type)
            return
        param_name = "SVR__C"
        param_range = sorted(C)
    else:
        logging.warning(
            "Validation curve is not applicable to Model type %s." %
            model_type)
        return

    logging.info("Calculating validation curve")
    train_scores, valid_scores = validation_curve(estimator=estimator,
                                                  X=train_dataset.data,
                                                  y=train_dataset.target,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  cv=cv,
                                                  scoring=score_attr,
                                                  n_jobs=n_jobs)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    logging.debug("Plotting validation curve")
    plt.title("Validation curve")
    plt.xlabel(param_name)
    # Bug fix: the conditional expression used to bind tighter than ``+``,
    # so the "Score" suffix was silently dropped whenever score_attr was
    # given. Parenthesise so the label is always "<SCORE>Score" / "Score".
    plt.ylabel((score_attr.upper() if score_attr else "") + "Score")
    plt.semilogx(param_range,
                 train_scores_mean,
                 'o-',
                 label="Training score",
                 color="r")
    plt.fill_between(param_range,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2,
                     color="r")
    plt.semilogx(param_range,
                 valid_scores_mean,
                 'o-',
                 label="Cross-Validation score",
                 color="g")
    plt.fill_between(param_range,
                     valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std,
                     alpha=0.2,
                     color="g")
    plt.legend(loc="best")

    if display:
        plt.show()
    if save:
        plt.savefig(filename + "_" +
                    datetime.now().strftime("%Y_%m_%d_%H_%M") + ".png",
                    dpi=400)
    plt.clf()
示例#12
0
# -*- coding: utf-8 -*-

import telebot
import config as cfg
import cherrypy
import requests
import shutil
import os
from ml import Model
from threading import Timer

# Telegram bot instance and the chat ML model it answers with.
bot = telebot.TeleBot(cfg.token)
model = Model()

# Per-user conversation state, keyed by user/chat id.
users = {}

# Hard-coded Telegram id of one specific user -- presumably the
# maintainer/admin; TODO confirm its role.
rybkin_id = 122358697

# Template for a single user's conversation state.
env_var = {
    'last_theme': None,
    'get_response': None,
    'expected': 'query',
    'timer': None,
    'timer_desc': '',
    'try_count': 0,
    'context': None
}

'''
class WebhookServer(object):
    @cherrypy.expose
示例#13
0
class VkBot:
    """Simple VK chat bot.

    Answers a fixed set of Russian commands (hello, weather, time, bye,
    consultation, start); any other message is treated as a product id and
    answered with recommendations from the ML model.
    """

    def __init__(self, user_id):
        """Remember the user, resolve their VK profile name, set up the
        recognised commands and load the recommendation model."""
        print(f"Создан объект бота для пользователя {user_id}!")

        self._USER_ID = user_id
        self._USERNAME = self._get_user_name_from_vk_id(user_id)

        # Recognised commands: HELLO, WEATHER, TIME, BYE, CONSULTATION, START.
        self._COMMANDS = [
            "ПРИВЕТ", "ПОГОДА", "ВРЕМЯ", "ПОКА", "КОНСУЛЬТАЦИЯ", "НАЧАТЬ"
        ]
        self._REGISTED_ID = []
        self.model = Model()

    def _get_user_name_from_vk_id(self, user_id):
        """Scrape the user's first name from the <title> of their public
        VK profile page."""
        request = requests.get("https://vk.com/id" + str(user_id))
        bs = bs4.BeautifulSoup(request.text, "html.parser")

        user_name = self._clean_all_tag_from_str(bs.findAll("title")[0])

        return user_name.split()[0]

    def new_message(self, message):
        """Dispatch an incoming message to the matching command handler, or
        fall back to the recommendation model with the message parsed as an
        item id."""

        # Hello / Start
        if message.upper() == self._COMMANDS[0] or message.upper(
        ) == self._COMMANDS[5]:
            if self._USER_ID not in self._REGISTED_ID:
                self._REGISTED_ID.append(self._USER_ID)
            return f"Привет-привет, {self._USERNAME}!"

        # Weather
        elif message.upper() == self._COMMANDS[1]:
            return self._get_weather()

        # Time
        elif message.upper() == self._COMMANDS[2]:
            return self._get_time()

        # Goodbye
        elif message.upper() == self._COMMANDS[3]:
            return f"Пока-пока, {self._USERNAME}!"

        # Consultation
        elif message.upper() == self._COMMANDS[4]:
            return f"Готов к консультации, {self._USERNAME}!"

        else:
            # Not a command: interpret the message as a product id and
            # build a "similar item" + "complementary item" answer; each
            # lookup degrades to an error line on failure.
            try:
                res1 = "Похожий товар: " + \
                       str(self.model.most_similar_id(int(message))) + "\n"
            except Exception as e:
                res1 = "Похожий товар не найден " + str(e) + "\n"
            try:
                res2 = "Дополнительный товар: " + \
                       str(self.model.predict_output_id(int(message)))
            except Exception as e:
                res2 = "Дополнительный товар не найден " + str(e) + "\n"
            return res1 + res2

    def _get_time(self):
        """Scrape the current time from my-calend.ru."""
        request = requests.get("https://my-calend.ru/date-and-time-today")
        b = bs4.BeautifulSoup(request.text, "html.parser")
        return self._clean_all_tag_from_str(
            str(b.select(".page")[0].findAll("h2")[1])).split()[1]

    @staticmethod
    def _clean_all_tag_from_str(string_line):
        """
        Strip all <...> tag markup from string_line, keeping the text
        between tags.
        :param string_line: string to clean
        :return: the cleaned string
        """

        result = ""
        not_skip = True
        for i in list(string_line):
            if not_skip:
                if i == "<":
                    not_skip = False
                else:
                    result += i
            else:
                if i == ">":
                    not_skip = True

        return result

    @staticmethod
    def _get_weather(city: str = "москва") -> str:
        """Scrape today's morning/afternoon temperatures and the textual
        forecast for *city* from sinoptik.com.ru.

        Note: annotation corrected -- this builds and returns a str, not a
        list as originally annotated.
        """

        request = requests.get("https://sinoptik.com.ru/погода-" + city)
        b = bs4.BeautifulSoup(request.text, "html.parser")

        p3 = b.select('.temperature .p3')
        weather1 = p3[0].getText()
        p4 = b.select('.temperature .p4')
        weather2 = p4[0].getText()
        p5 = b.select('.temperature .p5')
        weather3 = p5[0].getText()
        p6 = b.select('.temperature .p6')
        weather4 = p6[0].getText()

        result = ''
        result = result + ('Утром :' + weather1 + ' ' + weather2) + '\n'
        result = result + ('Днём :' + weather3 + ' ' + weather4) + '\n'
        temp = b.select('.rSide .description')
        weather = temp[0].getText()
        result = result + weather.strip()

        return result

    @staticmethod
    def ml_func():
        """Placeholder -- not implemented yet."""
        return
示例#14
0
from flask import Flask, render_template, request

app = Flask(__name__)

from ml import Model

# Model snapshot used by the search endpoint below; the argument is
# presumably a saved-model version tag -- TODO confirm.
model = Model('15_june_2020_v1')


@app.route('/')
def home():
    """Render the search page; when a ``query`` parameter is present,
    render the results page for it instead."""
    query = request.args.get('query')
    if not query:
        return render_template("home.html")
    matches = resML(query)
    return render_template("result.html", res=matches, query=query)


def resML(query):
    """Resolve a user query to the top 5 words and their meanings.

    :param query: free-text query string from the user
    :return: list of 5 ('word', 'meaning') tuples
    """
    candidate_words = model.get_words(query)
    return model.get_meanings(candidate_words)


@app.route('/chess')
def chess():
示例#15
0
# Route INFO-and-above records from the root logger to stdout.
root.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

handler.setFormatter(formatter)
root.addHandler(handler)

# ML MODEL

from ml import Model

# Model over the CSV data with a saved LSTM weights file; the paths are
# machine-specific -- TODO: make them configurable.
model = Model('/home/vbermudez/test/python_ai_test/csv/data.csv',
              '/home/vbermudez/test/python_ai_test/model/lstm_model.h5')

# Train from scratch unless a saved model already exists on disk.
if not model.exists: model.train()
else: model.prepare_model()

row_count, tip_avg = model.get_stats()

logging.info(f'LINEAS={row_count}')
logging.info(f'AVERAGE tip_amout={tip_avg}')

# Sample feature rows used to sanity-check the model's predictions.
values = [[1.7, 1.5], [6, 0.5], [1, 0]]

logging.info(f'Predictions:\n{model.predict( values )}')

# In memory MongoDB (MontyDB)
示例#16
0
def main():
    """Entry point for the ML pipeline.

    Reads the config file, initialises the database, loads the training
    and test datasets, trains the configured model, builds baseline and
    model predictions, and finally produces reports, scoreboard entries
    and optional charts. Exits via ``die(...)`` on any setup failure.
    """
    cli_args = parse_arguments()
    try:
        Config.read_config(cli_args.config_file)
    except ConfigError:
        die("Config File %s could not be read correctly! " %
            cli_args.config_file)
    init_logging()
    logging.info("Starting ML Pipeline!")
    logging.info("Initializing Database")
    try:
        DB.init_db()
    except DBError:
        die("DB Model could not be created!")

    logging.info("Reading training dataset")
    train_dataset = Dataset.get_dataset(
        Config.repository_name,
        Config.dataset_train_start,
        Config.dataset_train_end,
        Config.dataset_features,
        Config.dataset_target,
        ngram_sizes=Config.dataset_ngram_sizes,
        ngram_levels=Config.dataset_ngram_levels,
        label="Training",
        cache=Config.dataset_cache,
        eager_load=Config.database_eager_load,
        sparse=Config.dataset_sparse)
    if train_dataset is None:
        die("Training Dataset could not be created!")
    # Optionally train on log-transformed targets; everything is
    # exp-transformed back before reporting (see below).
    if Config.ml_log_transform_target:
        train_dataset.target = LogTransform.log_transform(
            train_dataset.target, base=Config.ml_log_transform_base)

    logging.info("Reading test dataset")
    test_dataset = Dataset.get_dataset(
        Config.repository_name,
        Config.dataset_test_start,
        Config.dataset_test_end,
        Config.dataset_features,
        Config.dataset_target,
        ngram_sizes=Config.dataset_ngram_sizes,
        ngram_levels=Config.dataset_ngram_levels,
        label="Test",
        cache=Config.dataset_cache,
        eager_load=Config.database_eager_load,
        sparse=Config.dataset_sparse)
    if test_dataset is None:
        die("Test Dataset could not be created!")
    if Config.ml_log_transform_target:
        test_dataset.target = LogTransform.log_transform(
            test_dataset.target, base=Config.ml_log_transform_base)

    logging.info("Creating and training model with training dataset")
    model = Model.create_model(Config.ml_model,
                               feature_scaling=Config.ml_feature_scaling,
                               polynomial_degree=Config.ml_polynomial_degree,
                               cross_validation=Config.ml_cross_validation,
                               alpha=Config.ml_alpha,
                               C=Config.ml_C,
                               kernel=Config.ml_kernel,
                               svr_degree=Config.ml_svr_degree,
                               svr_epsilon=Config.ml_svr_epsilon,
                               svr_gamma=Config.ml_svr_gamma,
                               svr_coef0=Config.ml_svr_coef0,
                               sparse=Config.dataset_sparse)

    Model.train_model(model, train_dataset)

    logging.info("Model successfully trained.")

    logging.debug("Creating predictions...")
    # Three naive baselines (mean / median / weighted random) to compare
    # the trained model against.
    baseline_mean_prediction = Predict.predict_mean(
        train_dataset, test_dataset.target.shape[0])
    baseline_med_prediction = Predict.predict_median(
        train_dataset, test_dataset.target.shape[0])
    baseline_wr_prediction = Predict.predict_weighted_random(
        train_dataset, test_dataset.target.shape[0])
    training_prediction = Predict.predict_with_model(train_dataset, model)
    test_prediction = Predict.predict_with_model(test_dataset, model)

    logging.debug("Creating reports from predictions")

    train_target = train_dataset.target
    test_target = test_dataset.target
    if Config.ml_log_transform_target:
        # Undo the log transform on every target/prediction so reports
        # are in the original scale.
        train_target = LogTransform.exp_transform(train_target,
                                                  Config.ml_log_transform_base)
        training_prediction = LogTransform.exp_transform(
            training_prediction, Config.ml_log_transform_base)
        test_target = LogTransform.exp_transform(test_target,
                                                 Config.ml_log_transform_base)
        test_prediction = LogTransform.exp_transform(
            test_prediction, Config.ml_log_transform_base)
        baseline_mean_prediction = LogTransform.exp_transform(
            baseline_mean_prediction, Config.ml_log_transform_base)
        baseline_med_prediction = LogTransform.exp_transform(
            baseline_med_prediction, Config.ml_log_transform_base)
        baseline_wr_prediction = LogTransform.exp_transform(
            baseline_wr_prediction, Config.ml_log_transform_base)

    baseline_mean_report = Reporting.Report(test_target,
                                            baseline_mean_prediction,
                                            "Mean Baseline")
    baseline_med_report = Reporting.Report(test_target,
                                           baseline_med_prediction,
                                           "Median Baseline")
    baseline_wr_report = Reporting.Report(test_target, baseline_wr_prediction,
                                          "Weighted Random Baseline")
    training_report = Reporting.Report(train_target, training_prediction,
                                       "Training")
    test_report = Reporting.Report(test_target, test_prediction, "Test")

    # Persist scoreboard entries and rank this run against previous ones.
    base_entry = Scoreboard.create_entry_from_config(baseline_wr_report)
    test_entry = Scoreboard.create_entry_from_config(test_report)
    Scoreboard.add_entry(base_entry)
    Scoreboard.add_entry(test_entry)
    Scoreboard.write_entries()
    base_ranking = Scoreboard.get_ranking(base_entry,
                                          Scoreboard.RATING_ATTRIBUTE_R2S)
    test_ranking = Scoreboard.get_ranking(test_entry,
                                          Scoreboard.RATING_ATTRIBUTE_R2S)

    if Config.reporting_display or Config.reporting_save:
        # NOTE(review): add_to_report / report_str appear to be
        # module-level helpers/state defined elsewhere in this file.
        config_table = Reporting.get_config_table()
        add_to_report(config_table.table)

        add_to_report(baseline_mean_report)
        add_to_report(baseline_med_report)
        add_to_report(baseline_wr_report)
        add_to_report(training_report)
        add_to_report(test_report)

        comparisation_table = Reporting.get_report_comparisation_table(
            [baseline_wr_report, training_report, test_report],
            [Reporting.SCORE_R2S, Reporting.SCORE_MAE, Reporting.SCORE_MDE])
        add_to_report(comparisation_table.table)

        category_table = Reporting.get_category_table(
            train_target, training_prediction, label="Training prediction")
        add_to_report(category_table.table)

        category_table = Reporting.get_category_table(test_target,
                                                      test_prediction,
                                                      label="Test prediction")
        add_to_report(category_table.table)

        confusion_matrix_table, classification_report = Reporting.get_confusion_matrix(
            train_target, training_prediction, label="Training prediction")
        add_to_report(confusion_matrix_table.table)
        add_to_report(classification_report)
        confusion_matrix_table, classification_report = Reporting.get_confusion_matrix(
            test_target, test_prediction, label="Test prediction")
        add_to_report(confusion_matrix_table.table)
        add_to_report(classification_report)

        if Config.ml_polynomial_degree == 1:
            # Determining top features only makes sense without polynomial features.
            top_features_table = Reporting.get_top_features_table(
                model, train_dataset.feature_list, 10)
            if top_features_table is not None:
                add_to_report(top_features_table.table)

        add_to_report("Base ranking: %i" % base_ranking)
        add_to_report("Test ranking: %i" % test_ranking)
        if test_ranking == 0:
            add_to_report("Congratulations! Best one so far!")
        elif base_ranking > test_ranking:
            add_to_report("Hey, at least better than the baseline!")
        else:
            add_to_report("Do you even learn?")

        if Config.reporting_display:
            print(report_str)

        if Config.reporting_save:
            Reporting.save_report_file(report_str,
                                       filename=Config.reporting_file)

        if Config.reporting_target_histogram:
            Reporting.plot_target_histogram(
                train_dataset,
                display=Config.reporting_display_charts,
                save=Config.reporting_save_charts,
            )

        if Config.reporting_validation_curve and Config.ml_cross_validation:
            Reporting.plot_validation_curve(
                model_type=Config.ml_model,
                train_dataset=train_dataset,
                alpha=Config.ml_alpha,
                C=Config.ml_C,
                feature_scaling=Config.ml_feature_scaling,
                polynomial_degree=Config.ml_polynomial_degree,
                kernel=Config.ml_kernel,
                svr_degree=Config.ml_svr_degree,
                svr_epsilon=Config.ml_svr_epsilon,
                svr_gamma=Config.ml_svr_gamma,
                svr_coef0=Config.ml_svr_coef0,
                sparse=Config.dataset_sparse,
                display=Config.reporting_display_charts,
                save=Config.reporting_save_charts)

        if Config.reporting_learning_curve:
            Reporting.plot_learning_curve(
                train_dataset=train_dataset,
                estimator=model,
                display=Config.reporting_display_charts,
                save=Config.reporting_save_charts)

        if Config.reporting_confusion_matrix_chart:
            Reporting.plot_confusion_matrix(
                ground_truth=train_target,
                predicted=training_prediction,
                label="Training",
                display=Config.reporting_display_charts,
                save=Config.reporting_save_charts)
            Reporting.plot_confusion_matrix(
                ground_truth=test_target,
                predicted=test_prediction,
                label="Test",
                display=Config.reporting_display_charts,
                save=Config.reporting_save_charts)

    logging.info("All done. Exiting ML Pipeline")
    def main(self):
        """Run the Avazu pipeline: load and split the data, optionally run
        feature engineering, select the K best features, then train the
        model and either write predictions or print the log loss.
        """
        # Step 1 --> Get and sample data from the original dataset
        print("[AVAZU]\tCreating Data Handler for train and test files...")
        # PEP 8: compare against None with ``is`` / ``is not``.
        if self.validation_file is None:
            data = DataHandler(self.train_file, self.train_file)
        else:
            data = DataHandler(self.train_file, self.validation_file)

        # you can then get train and test files by doing data._train or data._test
        data.create_train_and_test(self.sampling)
        print("[AVAZU]\tData read and split successfully.")

        # Step 2 --> Feature Engineering
        feat_eng = FeatureEngineering()
        # NOTE(review): ``need_feateng`` is not defined in this method nor
        # read from self -- presumably a module-level flag; confirm it
        # should not be ``self.need_feateng``.
        if need_feateng:
            print("[AVAZU]\tStarting feature engineering operations...")
            #data.transform_data(feat_eng.append_hours,feat_eng.append_counters_uniques)
            #data.transform_data(feat_eng.append_days,feat_eng.)
            print("[AVAZU]\tFinished feature engineering operations.")

        #print("[AVAZU]\tSaving intermediary dataframe...")
        #data.save("feat_eng_")
        #print("[AVAZU]\tFile saved...")

        data._train.drop(['click', 'id'], axis=1, inplace=True)
        data.drop([
            'C1', 'device_conn_type', 'device_type', 'banner_pos', 'C15',
            'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'device_model',
            'site_category', 'site_domain', 'site_id', 'app_category',
            'app_domain', 'app_id', 'hours', 'device_id', 'device_ip'
        ])

        print("[AVAZU]\tSelecting K-Best Features...")
        idxs = feat_eng.select_best_k_features(data._train, data._ytrain, 5)
        print(idxs)
        print("[AVAZU]\tFeatures selected.")

        # Step 3 --> Train model and test it
        print("Training and testing the model...")
        m = Model(idxs)
        m.train(data)
        m.predict(data)

        if self.validation_file is not None:
            print(type(data._test['id'].ravel()))
            df = pd.DataFrame(
                {
                    'id': data._test['id'].ravel(),
                    'click': m._ypred
                },
                columns=['id', 'click'])
            df.to_csv('output.csv', index=False)
            print("Output in the file: output.csv")
        else:
            print("Log Loss Result: " + str(m.log_loss(data)))

        # NOTE(review): this helper is defined but never called from
        # main(); it probably belongs at module level with the arg parsing.
        def print_help():
            print("python avazu.py <args>\n\n")
            print(
                "------------------------------------------------------------------"
            )
            print("List of arguments:")
            print(
                "\t -t: Specifies a training file to read from a csv file\t[REQUIRED]"
            )
            # "\%" was an invalid escape (SyntaxWarning since Python 3.6);
            # the intended character is a plain percent sign.
            print(
                "\t -v: Specifies a validation/test file to read from a csv file. If the file is not specified, 30% of the training samples will be used to build the validation dataset (Default: Not specified)"
            )
            print(
                "\t -s: Specifies a sampling ratio to be performed over the training data. If none is specified, it will use the whole dataset. (Default: Entire dataset)"
            )
            print(
                "\t -f: (To be inproved) Specifies if one desires to apply feature engineering over the dataset or not. (Default: No transformation applied)"
            )
            print("\t -h: Used to print this menu")
            print(
                "------------------------------------------------------------------"
            )
示例#18
0
def teste():
    """Train the model and return its prediction for the posted
    ``logicals`` payload as JSON ``{'data': '<n>'}``."""
    logical_inputs = request.json['logicals']
    classifier = Model()
    classifier.train()
    outcome = classifier.predict(logical_inputs)
    return jsonify({'data': str(outcome)})