def main(self):
    feature_engineering = FeatureEngineering()
    features_train, labels_train, features_test, labels_test, clean_data_frame = \
        feature_engineering.main()
    # random_forest = self.RFC(features_train, labels_train, features_test, labels_test, clean_data_frame)
    multinomial_nb = self.MNB(features_train, labels_train, features_test,
                              labels_test, clean_data_frame)
def __init__(self, train, test, id_column, y_column_name):
    self.train = train
    self.number_of_train = train.shape[0]
    self.y_column_name = y_column_name
    self.id_column = id_column
    self.test = test
    self.data = pd.concat([train, test], ignore_index=True)
    self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                  y_column_name)
    self.y = self.data[[self.id_column, self.y_column_name]]
Example #3
def __init__(self, train, test, id_column, y_column_name):
    self.train = train
    self.test = test
    self.y_column_name = y_column_name
    self.id_column = id_column
    self.data = pd.concat([train, test], ignore_index=True)
    self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                  y_column_name)
    self.feature_selection = FeatureSelection(train, test, id_column,
                                              y_column_name)
Example #4
    def fit_pipeline(self, input_csv_directory_path, input_csv_file_name):

        print('Start Testing pipeline')

        target_name = 'income'

        try:

            data_test = pd.read_csv(
                os.path.join(input_csv_directory_path, input_csv_file_name))

            print(data_test.head())
            print(data_test.shape)

            data_prepare = DataPrepare()
            df_clean = data_prepare.dataPrepare(data_test)

            feature_engineering = FeatureEngineering()
            df_features_target = feature_engineering.featureEngineering(
                df_clean)

            # Drop any rows with missing values
            df_features_target.dropna(axis=0, inplace=True)

            model_pipeline = helper_models.load_pipeline()

            prediction = model_pipeline.predict(
                df_features_target.drop(columns=target_name))
            probability = model_pipeline.predict_proba(
                df_features_target.drop(columns=target_name))

            print(
                "Classification report: \n ",
                classification_report(df_features_target[target_name],
                                      prediction))

            helper_models.fill_confusion_matrix_and_save(
                df_features_target[target_name],
                prediction,
                f_name='Test Confusion matrix',
                out_dir=input_csv_directory_path)

            helper_models.plot_roc_curve_and_save(
                df_features_target[target_name],
                probability,
                f_name='Test Roc Curve',
                out_dir=input_csv_directory_path)

            print('Pipeline completed successfully; '
                  'results are stored in the data directory')

        except Exception as ex:
            print('Something went wrong with the pipeline: %s' % ex)
            raise
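
A hypothetical invocation; the enclosing class name (TestPipeline) and the file name are invented for illustration:

pipeline = TestPipeline()  # hypothetical enclosing class
pipeline.fit_pipeline('data', 'adult_test.csv')  # hypothetical directory and file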
Example #5
    def __init__(self):

        # Run each feature-engineering step in turn
        fe = FeatureEngineering()
        fe.feature_age()
        fe.feature_days_admitted()
        fe.feature_total_medical_history()
        fe.feature_total_preop_medication()
        fe.feature_total_symptoms()
        fe.feature_lab_results_ratios()

        # Drop unneeded columns, then one-hot encode the categoricals
        df_data = self.__drop_cols(fe.df_data)
        self.df_data = self.__get_dummy_vars(df_data)
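
The two private helpers are not shown in the snippet. A minimal sketch of what they plausibly do; the dropped column name and the pd.get_dummies call are assumptions, not the original implementation:

    def __drop_cols(self, df):
        # Hypothetical: drop an identifier column that carries no signal
        return df.drop(columns=['patient_id'], errors='ignore')

    def __get_dummy_vars(self, df):
        # One-hot encode the remaining categorical columns
        return pd.get_dummies(df)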
Example #6
def create_reader_new(self):
    source_type = self._model_desc_obj.get("dataSource").get("source_type")
    parameters = self._model_desc_obj.get("dataSource").get("parameters")
    print('--------- create_reader parameters start ---------')
    for k in sorted(parameters.keys()):
        print(k, parameters[k])
    print('---------- create_reader parameters end ----------')
    if source_type == "kafka":
        reader = self.tensor_dict_from_kafka(parameters)
    elif source_type == "file":
        reader = self.tensor_dict_from_hdfs(parameters)
    else:
        # Fail fast instead of hitting an unbound `reader` below
        raise ValueError('Unsupported source_type: %s' % source_type)
    fe = FeatureEngineering()
    reader = fe.get_tensor_dict(reader)
    reader.init(self.context)
    return reader
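
For reference, a minimal sketch of the dataSource block this method expects. The field names come from the get() calls above; the concrete parameter keys and values are hypothetical:

model_desc = {
    "dataSource": {
        "source_type": "kafka",  # or "file"; anything else now raises ValueError
        "parameters": {
            # Hypothetical reader settings; real keys depend on the deployment
            "topic": "events",
            "batch_size": 256,
        },
    }
}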
Example #7
    def get_data_and_pipeline(self):
        """Build the feature matrix and the preprocessing pipeline.

        Uses the FeatureEngineering class imported above to construct the
        feature matrix and the full preprocessing pipeline, and stores both
        on the instance.

        Args:
            None

        Returns:
            None
        """

        data = FeatureEngineering(self.data_path)
        X, full_pipeline = data.build_pipe(hash_size=100)
        self.data = X
        self.pipeline = full_pipeline
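
A hypothetical usage sketch; the enclosing class name (Trainer) and the data path are invented for illustration:

trainer = Trainer(data_path='data/train.csv')  # hypothetical class and path
trainer.get_data_and_pipeline()
print(trainer.data.shape)  # feature matrix produced by build_pipe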
Example #8
def main(data_directory_path):

    print("Model Process starts")

    start = time.time()

    data_read = DataRead(data_directory_path)

    data_prepare = DataPrepare()

    data_explore = DataExploration()

    feature_engineering = FeatureEngineering()

    modelling = Modelling()

    model_pipeline = ModelPipeline(data_read, data_explore, data_prepare,
                                   feature_engineering, modelling)

    model_pipeline.fit(data_directory_path)

    print("Model Process ends", time.time() - start, "s")
Example #9
def main(data_directory_path, merge_csv_file_name, prepared_csv_file_name,
         features_target_csv_file_name):

    print("Model Process starts")

    #path = "E:\PlusDental_Task\sample_data"
    #merge_file_name = "data_merged.csv"
    #prepared_file_name = "data_prepared.csv"
    #feature_target_file_name = "features_target.csv"

    start = time.time()

    data_read_and_merge = DataReadAndMerge(data_directory_path,
                                           merge_csv_file_name)
    # data_read_and_merge.readAndMerge(path,merge_file_name)

    data_prepare = DataPrepare(data_directory_path, merge_csv_file_name)
    #data_prepare.dataPrepare(path, merge_file_name)

    #data_prepared = pd.read_csv(os.path.join(data_directory_path, prepared_csv_file_name))
    #print(data_prepared.head())
    #print(data_prepared.shape)

    #data_explore = DataExploration(data_prepared)
    #data_explore.dataExploration(data_prepared)

    feature_engineering = FeatureEngineering(data_directory_path,
                                             prepared_csv_file_name)
    #feature_engineering.featureEngineering(path,prepared_file_name)

    modelling = Modelling(data_directory_path, features_target_csv_file_name)
    #modelling.modelling(data_directory_path, features_target_csv_file_name)

    model_pipeline = ModelPipeline(data_read_and_merge, data_prepare,
                                   feature_engineering, modelling)
    model_pipeline.fit(data_directory_path, merge_csv_file_name,
                       prepared_csv_file_name, features_target_csv_file_name)

    print("Model Process ends", time.time() - start, "s")
Example #10
def setUpDataFrame():
    train_filepath = read_yaml('baseConfig.yaml')
    df_raw = pd.read_csv(train_filepath, low_memory=False, parse_dates=['saledate'])
    print('The shape of dataframe is %s' % str(df_raw.shape))
    cleaning = Cleaning()
    print('Converting sale price to log of sale price')
    df_raw = cleaning.convertFeatureToItsLog(df_raw, 'SalePrice')
    
    print("Turning string to categorical variables")
    df_raw = cleaning.turnStringToCategorical(df_raw)
    #Aligning the levels properly
    df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)

    #converting date and time to features
    feat_eng = FeatureEngineering()
    feat_eng.convertDatesToFeatures(df_raw, 'saledate')
    #saving as feather
    try:
        os.makedirs('tmp', exist_ok=True)
        df_raw.to_feather('tmp/raw')
    except (FileNotFoundError, IOError) as e:
        print(e)
Example #11
def __init__(self, data, y_column_name):
    self.data = data
    self.y_column_name = y_column_name
    self.y = self.data[[self.y_column_name]]
    self.feature_engineering = FeatureEngineering(data, y_column_name)
Example #12
def __init__(self, data, y_column_name):
    self.data = data
    self.y_column_name = y_column_name
    self.feature_engineering = FeatureEngineering(data, y_column_name)
    self.feature_selection = FeatureSelection(data, y_column_name)
Example #13
import mlflow.tensorflow

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from data_processing import DataProcessing
from feature_engineering import FeatureEngineering
from classifier import QuestionAnswerClassifer

if __name__ == '__main__':

    # new_df is assumed to be a DataFrame with 'Question', 'Sentence' and
    # 'Label' columns, prepared earlier (e.g. by DataProcessing)
    Feature_Engineering = FeatureEngineering(new_df, 'Question', 'Label',
                                             'Sentence')

    maxlen = Feature_Engineering.determine_maxlen() + 10
    print(f'Maxlen:{maxlen}')

    #Split the dataset into train and test set
    new_df['Question_Sentence'] = new_df['Question'] + ' ' + new_df['Sentence']
    features = new_df['Question_Sentence']
    target = new_df['Label']
    x_train, x_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.25,
                                                        random_state=0)

    tokenizer = Feature_Engineering.text_tokenize(x_train.values,
                                                  num_words=5000)
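
The snippet stops after tokenization, but the pad_sequences import above points at the next step. A hedged continuation, assuming text_tokenize returns a fitted Keras Tokenizer:

    # Convert the texts to integer sequences and pad to a fixed length
    x_train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train.values),
                                maxlen=maxlen)
    x_test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test.values),
                               maxlen=maxlen)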
Example #14
from feature_engineering import FeatureEngineering

f = FeatureEngineering('ex_train.csv', 'ex_test.csv', 'key',
                       ['ex_wide_1.csv', 'ex_wide_2.csv'], 'ex_long.csv')
d = f.extract_features()
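
Assuming extract_features returns a pandas DataFrame (an assumption; the snippet does not show its return type), a quick inspection might look like:

print(d.shape)
print(d.head())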
Example #15
    print("Turning string to categorical variables")
    df_raw = cleaning.turnStringToCategorical(df_raw)
    #Aligning the levels properly
    df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)

    #converting date and time to features
    feat_eng = FeatureEngineering()
    feat_eng.convertDatesToFeatures(df_raw, 'saledate')
    #saving as feather
    try:
        os.makedirs('tmp', exist_ok=True)
        df_raw.to_feather('tmp/raw')
    except (FileNotFoundError, IOError) as e:
        print(e)

feat_eng = FeatureEngineering()
print(feat_eng.testIfDateTimeWorks())


base_config = read_yaml('baseConfig.yaml')

#Reading files
try:
    df_raw = pd.read_feather(base_config.parameters.bulldozer_train_feather)
    print('Finished reading feather file')
    if 'saleYear' in df_raw.columns:
        print('Features from dates are present in this feather file')
except (IOError, OSError) as e:
    print('Feather file does not exist')
    print(e)
    print('Doing the initial setup')
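
The snippet ends inside the except block. A plausible completion, based on Example #10's setUpDataFrame, which writes the feather file to tmp/raw (how the fallback actually proceeds is an assumption):

    setUpDataFrame()
    df_raw = pd.read_feather('tmp/raw')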