def test_model_function(self):
    # x and y were undefined in the original; load a small slice of the
    # training data, mirroring test_predict below
    data = PreprocessTags(True).load_data(r'..\data\train.wtag')
    x = data.x[0:30]
    y = data.y[0:30]

    tests = Features().get_tests().keys()
    model1 = Model(tests)
    model1.fit(x, y)
    # FinkMos takes (x, y, tag_corpus); the original also passed model1.tests,
    # which does not match the constructor shown below
    fm = FinkMos(x, x, model1.tag_corpus)
    a = model1.model_function(1, 3, [2, 3], fm)
    print("model function result")
    print(a)
def api(img=None):
    response = {
        'status': False,
        'msg': 'Unexpected argument. No image specified',
        'data': None
    }
    if img:
        # request.json is not callable; an uploaded file lives in request.files
        img = request.files['img']
        if upload_file(img, img.filename, app.config['UPLOAD_FOLDER']):
            # preprocess image
            features = Features(data_dir=config.DATASET_PATH)
            img_class = predict(img.filename)
            # build success response (the FileStorage object itself is not
            # JSON-serializable, so return its filename)
            response['status'] = True
            response['msg'] = 'Upload successful.'
            response['data'] = {'img': img.filename, 'img_class': img_class}
        else:
            response['msg'] = 'Could not upload image'
    return jsonify(response)
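# A minimal sketch of exercising this handler with Flask's test client,
# assuming the function is registered as a POST route at /api; the URL, the
# 'img' form field, and the sample file name are assumptions, since the route
# decorator is not shown above.
with app.test_client() as client:
    with open('cat.jpg', 'rb') as f:  # 'cat.jpg' is a placeholder image
        resp = client.post('/api',
                           data={'img': (f, 'cat.jpg')},
                           content_type='multipart/form-data')
    print(resp.get_json())  # -> {'status': ..., 'msg': ..., 'data': ...}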
def test_predict(self):
    # Load Data
    data = PreprocessTags(True).load_data(r'..\data\train.wtag')
    word_num = 30
    x = data.x[0:word_num]
    y = data.y[0:word_num]

    # generate tests - (comment out if file is updated)
    feat_generator = Features()
    feat_generator.generate_tuple_corpus(x, y)
    for template in feat.templates_dict.values():
        feat_generator.generate_lambdas(template['func'], template['tuples'])
    feat_generator.save_tests()

    model1 = Model()
    model1.fit(x, y)
    x_test = x
    y_hat = model1.predict(x_test)
    print(y_hat)
    cm = model1.confusion(y_hat=y_hat, y=y)
    cm.to_csv(r'../training/confusion_matrix.csv')
def test_create_tuples(self):
    data = PreprocessTags(True).load_data(r'..\data\train.wtag')
    word_num = 1_000
    tag_corp = pd.Series(data.y[0:word_num]).unique()

    # generate tests - (comment out if file is updated)
    feat_generator = Features()
    feat_generator.generate_tuple_corpus(data.x[0:word_num], data.y[0:word_num])
    for template in feat.templates_dict.values():
        feat_generator.generate_lambdas(template['func'], template['tuples'])
    feat_generator.save_tests()

    fm = FinkMos(data.x[0:word_num], data.y[0:word_num], tag_corp)
    fm.create_tuples()
    print("fm.weight_mat")
    print(fm.weight_mat)
    print("fm.tuple_5_list")
    print(fm.tuple_5_list)
    fm.create_feature_sparse_list_v2()
    # print(len(fm.f_matrix_list))
    print(fm.f_matrix_list[0].shape)
    fm.minimize_loss()
    fm.v.dump('values')
def test_feature_generator(self):
    data = PreprocessTags(True).load_data(r'..\data\toy_dataset.txt')
    feat_generator = Features()
    feat_generator.generate_tuple_corpus(data.x[0:10000], data.y[0:10000])
    try:
        # feat_generator.get_tests()  # loads last version saved
        pass
    except:
        pass
    for template in feat.templates_dict.values():
        feat_generator.generate_lambdas(template['func'], template['tuples'])
    # feat_generator.add_lambdas(feat.suffix_funcs_all)  # DONE
    # feat_generator.add_lambdas(feat.prefix_funcs_all)  # DONE
    result = feat_generator.lambdas
    print(len(result))
    with open(r"../training/report_lambdas_dict.p", 'wb') as stream:
        pickle.dump(result, stream)
# Set up logger
logger = setup_logger()
logger.info(args)

# Load Dataset & Batch Loader
logger.info("Loading the dataset ...")
dataset = TrainingDataset(args.dataset)
train_loader = torch.utils.data.DataLoader(dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers)

# Load Model
logger.info("Loading the model ...")
model = Features()

# Triplets Loss
logger.info("Loading the triplets loss function ...")
triplets_loss = TripletsLoss()

# Enable GPU
if use_gpu:
    model = model.cuda()
    triplets_loss = triplets_loss.cuda()

# Set model in training mode
logger.info("Setting up training mode ...")
model.train()

# Adam Optimizer
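# The script breaks off at the optimizer comment. A minimal sketch of how the
# Adam optimizer and training loop might continue: args.lr, args.epochs, and
# the (anchor, positive, negative) batch layout are assumptions not shown
# above, so adjust to the actual argument parser and dataset.
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

for epoch in range(args.epochs):
    for anchor, positive, negative in train_loader:
        if use_gpu:
            anchor = anchor.cuda()
            positive = positive.cuda()
            negative = negative.cuda()
        optimizer.zero_grad()
        # embed each leg of the triplet and minimize the triplet loss
        loss = triplets_loss(model(anchor), model(positive), model(negative))
        loss.backward()
        optimizer.step()
    logger.info("Epoch {}: last batch loss {:.4f}".format(epoch, loss.item()))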
import unittest
import os

import numpy as np
import pandas as pd

from models.model import Model
import models.features as feat
from models.features import Features
from models.prerocesing import PreprocessTags
from models.sentence_processor import FinkMos

os.chdir(r'C:\Users\amoscoso\Documents\Technion\nlp\nlp_hw\tests')

# %%
data = PreprocessTags(True).load_data(r'..\data\train.wtag')
word_num = 500

# generate tests - (comment out if file is updated)
feat_generator = Features()
feat_generator.generate_tuple_corpus(data.x[0:word_num], data.y[0:word_num])
for template in feat.templates_dict.values():
    feat_generator.generate_lambdas(template['func'], template['tuples'])
feat_generator.save_tests()

test_data = PreprocessTags(True).load_data(r'..\data\test.wtag')

# %%
word_num = 500
test_number = 50
model1 = Model()
model1.fit(data.x[0:word_num], data.y[0:word_num])
y_hat = model1.predict(test_data.x[:test_number])
# compare predictions against the test labels (the original passed data.y,
# i.e. the training labels, which looks like a bug)
model1.confusion(y_hat, test_data.y[:test_number])
# coding: utf-8

# import necessary dependencies and files
import os
import sys

import tensorflow as tf
import numpy as np

from models.config import DATASET_PATH, SAVED_FEATURES
from models.features import Features

# Load in the datasets
features = Features(data_dir=DATASET_PATH)
if os.path.isfile(SAVED_FEATURES):
    datasets = np.load(SAVED_FEATURES)
else:
    datasets = features.create(save_file=SAVED_FEATURES)

# Split into training and testing set
X_train, y_train, X_test, y_test = features.train_test_split(datasets)
print('Length of training set: {:,}'.format(len(y_train)))
print('Length of testing set: {:,}'.format(len(y_test)))

# Define Hyperparameters

# Image & labels
image_size = features.image_size
image_channel = 3
image_shape = (image_size, image_size, image_channel)
class TrainPredictDuration:
    def __init__(self, config_path, model_name):
        model_props = load_properties(config_path, model_name)
        self.non_categorical_features = model_props['non_categorical_features']
        self.target = model_props['target']
        self.test_size = float(model_props['test_size'])
        self.model_output_filepath = model_props['model_output_filepath']
        self.model_result_filepath = model_props['model_result_filepath']
        self.features_filename = model_props['features_filename']
        self.Features = Features(config_path, model_name)
        self.subscr_type = model_props['subscr_type']
        self.drop_columns = model_props['drop_columns']
        self.drop_rows = model_props['drop_rows']

    def data_preparation(self):
        cursor = make_connection()
        df = load_df(
            cursor,
            """select aa.*, bb.municipal as start_municipal, bb.lat as start_lat, bb.lng as start_lng,
            cc.municipal as end_municipal, cc.lat as end_lat, cc.lng as end_lng
            from hubway_trips as aa
            left join hubway_stations as bb on aa.strt_statn = bb.id
            left join hubway_stations as cc on aa.end_statn = cc.id"""
        )
        weather_df = load_df(cursor, """select * from weather""")

        # cast numeric and datetime columns
        numeric_cols = ['duration', 'birth_date', 'start_lat', 'start_lng',
                        'end_lat', 'end_lng']
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
        weather_df['hpcp'] = pd.to_numeric(weather_df['hpcp'])
        df[['start_date', 'end_date']] = df[['start_date', 'end_date']].apply(pd.to_datetime)
        weather_df['date_time'] = weather_df['date_time'].apply(pd.to_datetime)

        # keep one subscriber type and drop duration outliers
        df = df[df['subsc_type'] == self.subscr_type]
        df = df[(df['duration'] < df['duration'].quantile(0.75)) & (df['duration'] > 0)]
        df.dropna(subset=self.drop_rows.split(','), inplace=True)

        def weather_hpcp(df, weather_df, date):
            # attach the nearest precipitation reading (within 3 days) to each trip
            new_col_name = date.split("_")[0] + '_hpcp'
            tol = pd.Timedelta(days=3)
            df = pd.merge_asof(df.sort_values(by=date),
                               weather_df[['date_time', 'hpcp']]
                               .sort_values(by='date_time')
                               .set_index('date_time'),
                               right_index=True,
                               direction='nearest',
                               tolerance=tol,
                               left_on=date)
            df.rename(columns={'hpcp': new_col_name}, inplace=True)
            # fill remaining gaps with the monthly mean
            df[new_col_name] = df[new_col_name].groupby(
                [df[date].dt.month]).transform(lambda x: x.fillna(x.mean()))
            return df

        df = weather_hpcp(df, weather_df, "start_date")
        df = weather_hpcp(df, weather_df, "end_date")
        return df

    def feature_engineering(self, df):
        df['driver_age'] = df.apply(
            lambda x: self.Features.driver_age(x['birth_date']), axis=1)
        df['driver_age_cat'] = df.apply(
            lambda x: self.Features.driver_age_category(x['driver_age']), axis=1)
        df['travel_distance'] = df.apply(
            lambda x: self.Features.distance(x['start_lat'], x['start_lng'],
                                             x['end_lat'], x['end_lng']),
            axis=1)
        df['average_speed'] = df.apply(
            lambda x: self.Features.average_speed(x['travel_distance'], x['duration']),
            axis=1)
        df = self.Features.temporal_features(df)
        df = self.Features.one_hot_encoding(df)
        df['is_station_diff'] = self.Features.strt_end_diff(df)
        df = self.Features.station_flows(df)
        # report any columns that still contain missing values
        for col in df.columns:
            if df[col].isna().sum() != 0:
                print(col)
        df.to_csv(os.path.join("data", self.features_filename + '.csv'), index=False)
        return df

    def feature_selection(self, df):
        non_impact_features = [
            'is_start_11Q4', 'is_adult', 'is_start_weekend', 'is_start_9',
            'is_start_12Q2', 'is_start_4', 'is_start_17', 'is_young_adult',
            'is_start_working_day', 'is_start_12Q3'
        ]
        non_categorical_features = self.non_categorical_features.split(",")
        X = df[non_categorical_features + list(df.filter(regex='is_').columns)]
        # X = X.drop(X[non_impact_features], axis=1)
        feature_names = list(X.columns)
        y = df[self.target]
        return X, y, feature_names

    def train_test_split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=42)
        return X_train, X_test, y_train, y_test

    def grid_search_train(self, X, y, estimator, param):
        param_grid = {
            "rf_{}".format(self.subscr_type): {
                # "n_estimators": list(range(20, 81, 10)),
                "n_estimators": [80],
                "bootstrap": ['True'],
                "criterion": ['mse'],
                "max_features": ['auto', 'sqrt'],
                "min_samples_leaf": [5]
            },
            "lr_{}".format(self.subscr_type): {
                "normalize": ['True'],
                "alpha": [0.01, 0.02, 0.03, 0.04],
            },
            "gb_{}".format(self.subscr_type): {
                "n_estimators": list(range(20, 81, 10)),
                "learning_rate": [0.01, 0.02, 0.03, 0.04],
                "min_samples_split": [500],
                "min_samples_leaf": [50],
                "max_depth": [4, 6, 8, 10],
                "max_features": ['auto'],
                "subsample": [0.9, 0.5, 0.2, 0.1]
            }
        }
        print(param_grid[param])
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
        grid_search = GridSearchCV(estimator, param_grid[param], cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        predictions = model.predict(X_test)
        return model, predictions, y_test

    def save_model(self, model, model_name):
        full_path = os.path.join(self.model_output_filepath, model_name)
        pickle.dump(model, open(full_path, 'wb'))

    def save_output(self, y_pred_test, y_test, output_name):
        results = {
            "y_pred_test": y_pred_test.tolist(),
            "y_test": y_test.tolist()
        }
        with open(os.path.join(self.model_result_filepath, output_name), 'w') as ofile:
            json.dump(results, ofile)

    def post_processing(self):
        data_file_path = "data/" + self.features_filename + ".csv"
        ddl_file_path = "ddls/" + self.features_filename + "_table_create.sql"
        cursor = make_connection()
        with open(ddl_file_path, "r") as file_obj:
            sql_statement = file_obj.read()
        try:
            if SQLHandler().execute_ddl(cursor, sql_statement):
                print("Table Created")
            else:
                print("Skipping table creation")
        except IOError as f_ex:
            print("File {} not accessible, Error message : {}".format(
                ddl_file_path, f_ex))
        try:
            if SQLHandler().ingest_csv(cursor, data_file_path, self.features_filename):
                print("Data Ingested")
            else:
                print("Please Check CSV File")
        except Exception as ex:
            # the original format string had one placeholder for two arguments
            print("Unable to ingest {}: {}".format(data_file_path, ex))
        return
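# A minimal usage sketch of the pipeline end to end. The 'config.ini' path,
# the 'duration_model' section name, and the choice of scikit-learn's
# RandomForestRegressor as estimator are all assumptions; only the method
# calls themselves come from the class above.
from sklearn.ensemble import RandomForestRegressor

pipeline = TrainPredictDuration('config.ini', 'duration_model')
df = pipeline.data_preparation()
df = pipeline.feature_engineering(df)
X, y, feature_names = pipeline.feature_selection(df)
model, predictions, y_test = pipeline.grid_search_train(
    X, y, RandomForestRegressor(), "rf_{}".format(pipeline.subscr_type))
pipeline.save_model(model, "rf_{}.pkl".format(pipeline.subscr_type))
pipeline.save_output(predictions, y_test,
                     "rf_{}_results.json".format(pipeline.subscr_type))
pipeline.post_processing()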
class FinkMos:
    def __init__(self, x, y, tag_corpus):
        assert isinstance(x, pd.Series)
        self.tag_corpus = tag_corpus
        self.test_dict = Features().get_tests()
        self.test_vec = np.array([test['func'][1] for test in self.test_dict.values()])
        self.x = x
        self.y = y
        self.f_matrix_list = None
        # self.linear_loss_done = None
        # self.word2number = {word: index for index, word in enumerate(x.value_counts().index)}
        # tc = tag_corpus.shape[0]
        self.fast_test = dict()
        self.fast_predict = dict()
        self.weight_mat = None
        self.tuple_5_list = None
        self.tup5_2index = dict()
        self.opt = None
        self.v = None
        self.f_v_train = None
        self.calc_from_mem = None

    def create_tuples(self):
        """
        Tuple handling.

        :Create:
            tuple_5_list (list of 5-tuple combinations)
            weight_mat (number of occurrences of each tuple in the dataset)
        """
        tx_0 = self.x.values
        ty_0 = self.y.values
        tx_1 = np.roll(tx_0, 1)
        tx_2 = np.roll(tx_1, 1)
        ty_1 = np.roll(ty_0, 1)
        ty_2 = np.roll(ty_1, 1)
        tuple_6_np = ty_0 + "_" + ty_1 + "_" + ty_2 + "_" + tx_0 + "_" + tx_1 + "_" + tx_2
        tuple_6_counts_series = pd.Series(tuple_6_np).value_counts()
        tuple_5_df = pd.DataFrame([ty_1, ty_2, tx_0, tx_1, tx_2]).T
        tuple_5_df.sort_values([0, 1, 2, 3, 4], inplace=True)  # sort the 5-tuple list
        tuple_5_df.drop_duplicates(inplace=True, keep='first')  # remove duplicates
        # make a list out of every row of the DataFrame
        self.tuple_5_list = list(map(lambda x: list(x[1]), tuple_5_df.iterrows()))
        # create weight mask
        weight_mask = spar.csr_matrix((self.tag_corpus.shape[0], tuple_5_df.shape[0]), dtype=int)
        self.tup5_2index = {"_".join(x): num for num, x in enumerate(tuple_5_df.values)}
        for tup, count in tuple_6_counts_series.items():
            tup_0 = tup.split('_')[0]
            tup_5 = '_'.join(tup.split('_')[1:])
            ind_j = self.tup5_2index[tup_5]
            itemindex = np.where(self.tag_corpus == tup_0)
            ind_i = itemindex[0]
            weight_mask[ind_i, ind_j] = count
        self.weight_mat = weight_mask

    def create_feature_sparse_list_v2(self, training_fm=None):
        # return a list of sparse matrices, one per tag in the tag corpus
        tuple_5_size = len(self.tuple_5_list)
        tuple_0_size = self.tag_corpus.shape[0]
        num_test = len(self.test_dict)
        # a list of empty sparse matrices
        result = [spar.csr_matrix((tuple_5_size, num_test), dtype=bool)
                  for _ in range(tuple_0_size)]
        if self.y is None:  # inference mode
            calculated = spar.csr_matrix((tuple_5_size, tuple_0_size), dtype=int)
            for tup_5_ind, tup5 in enumerate(self.tuple_5_list):
                # if calculated before, take the cached value
                tup_5_str = '_'.join(tup5)
                if tup_5_str in training_fm.tup5_2index:  # TODO: .get instead of in
                    ind_in_train = training_fm.tup5_2index[tup_5_str]
                    calculated[tup_5_ind, :] = training_fm.f_v_train[:, ind_in_train]
                    continue
                for tup_0_ind, tup0 in enumerate(self.tag_corpus):
                    tup = (tup0,) + tuple(tup5)
                    result[tup_0_ind][tup_5_ind, :] = np.array([test(tup) for test in self.test_vec])
            self.calc_from_mem = calculated
        else:
            # iterate over test names, then over the tuples recorded per test
            for test_ind, (key, val) in enumerate(self.test_dict.items()):
                for tup in set(val['tup_list']):
                    tup_0_ind = np.where(tup[0] == self.tag_corpus)[0][0]
                    tup_5_ind = self.tup5_2index['_'.join(tup[1:])]
                    result[tup_0_ind][tup_5_ind, test_ind] = True
        self.f_matrix_list = result

    def loss_function(self, v):
        f_v = self.dot(v)
        f_v_mask = self.weight_mat.multiply(f_v)
        l_fv = np.sum(np.sum(f_v_mask))
        exp_ = np.exp(f_v)
        exp_sum = np.sum(exp_, axis=0)
        repetitions = np.array(self.weight_mat.sum(axis=0))  # from here not sparse
        ln = np.log(exp_sum) * repetitions
        sum_ln = np.sum(ln)
        return sum_ln - l_fv  # + 0.1 * np.linalg.norm(v)

    def loss_gradient(self, v):
        f_v = self.dot(v)  # dims: tup_0 x tup_5
        e_f_v = np.exp(f_v)  # dims: tup_0 x tup_5
        z = np.sum(e_f_v, axis=1) + 1e-11
        p = (e_f_v.T / z).T  # dims: tup_0 x tup_5
        f_p_tup5_list = []  # sum over tuples list
        f_v_tup_0_tests = []
        for tup_0_ind, sparse_matrix in enumerate(self.f_matrix_list):
            spar_t = sparse_matrix.T
            # Left
            weight_vec = self.weight_mat[tup_0_ind, :]
            weighted_slice = spar.csr_matrix.multiply(spar_t, weight_vec)
            f_v_tests = weighted_slice.sum(axis=1)
            f_v_tup_0_tests.append(f_v_tests)
            # Right
            f_p = spar.csr_matrix.multiply(spar_t, p[tup_0_ind, :])  # dims: tup_5 x tests
            f_p_tup5_list.append(f_p)
        sparse_sum = sum(f_p_tup5_list)
        sparse_sum_w_weight = spar.csr_matrix.multiply(sparse_sum, self.weight_mat.sum(axis=0))
        right = np.squeeze(np.array(sparse_sum_w_weight.sum(axis=1)))
        left = np.array(f_v_tup_0_tests)  # dims: 1 x dim(v)
        left_sum = np.squeeze(np.array(np.sum(left, axis=0)))
        regularization = 0.2 * v
        result = left_sum - right  # - regularization
        return -result

    def dot(self, v):
        results = [sparse_matrix.dot(v) for sparse_matrix in self.f_matrix_list]
        return np.array(results)

    def minimize_loss(self):
        self.opt = minimize(self.loss_function,
                            np.ones(len(self.test_dict)),
                            jac=self.loss_gradient,
                            options=dict(disp=True,
                                         maxiter=15,
                                         # eps=1e-5,
                                         # gtol=1e-6
                                         ),
                            method='CG',
                            callback=self.callback_cunf)
        self.v = self.opt.x
        self.f_v_train = self.dot(self.v)

    def callback_cunf(self, x):
        print(f'Current loss {self.loss_function(x)}')

    def prob_q2(self, v, y_token, training_fm):
        self.create_feature_sparse_list_v2(training_fm)  # creates f_matrix_list
        f_v = self.dot(v) + self.calc_from_mem.T  # dims: tup_0 x tup_5
        y_nomin = np.array(f_v[y_token])  # dims: tup_5 x 1
        exp_ = np.array(np.exp(f_v)).squeeze()
        exp_sum = np.sum(exp_, axis=0)  # dims: tup_5 x 1
        prob = np.array(y_nomin / (exp_sum + 1e-10))[0]  # dims: tup_5 x 1
        return prob
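# Reading loss_function together with weight_mat suggests the objective is the
# count-weighted negative log-likelihood of a log-linear tagger. The
# correspondence below is our sketch; the symbols do not appear in the source:
#
#   L(v) = sum_h c_h * log( sum_{y' in Y} exp(f(y', h) . v) )
#          - sum_{y,h} c_{y,h} * f(y, h) . v
#
# where h ranges over the distinct 5-tuple histories (tuple_5_list), Y is the
# tag corpus, c_{y,h} are the occurrence counts stored in weight_mat (c_h its
# column sums), and f(y, h) is the Boolean feature vector encoded in
# f_matrix_list. loss_gradient appears to return the gradient of this same
# expression, which is why minimize_loss can pass it as jac to scipy's minimize.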