def main(mode='train'):
    train_path = 'data/train.csv'
    test_path = 'data/test.csv'
    data_train = data_loader.preprocess_data(train_path)
    data_test = data_loader.preprocess_data(test_path)
    data_X, data_Y = get_label_data(data_train)
    if mode == 'train':  # was `mode in 'train'`, a substring test that also matched e.g. 'rain'
        train_X, train_Y, test_X, test_Y = data_train_test_split(data_X, data_Y)
        print("Model in training........")
        clf = train_random_forest_classifier(train_X=train_X, train_Y=train_Y,
                                             test_X=test_X, test_Y=test_Y)
        save_model(clf, 'model/rft-model.pkl')
        del clf
    clf = load_model('model/rft-model.pkl')
    print('\nValidation of model (k-fold)')
    validate_model_kfold(clf, data_X, data_Y)
    print("Write predicted values to disk")
    generate_submission(clf, data_test=data_test)

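# save_model/load_model are not defined in this snippet. A minimal sketch,
# assuming the usual pickle-based persistence (the project may equally use
# joblib or another serializer):
import pickle

def save_model(clf, path):
    # Serialize a fitted estimator to disk.
    with open(path, 'wb') as f:
        pickle.dump(clf, f)

def load_model(path):
    # Deserialize an estimator previously written by save_model.
    with open(path, 'rb') as f:
        return pickle.load(f)
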
def main():
    train_path = 'data/train.csv'
    test_path = 'data/test.csv'
    train_X, train_Y = data_loader.preprocess_data(train_path, data_mode='train')
    data_test, _ = data_loader.preprocess_data(test_path, data_mode='test')
    train_X, train_Y, test_X, test_Y = data_train_test_split(train_X, train_Y)
    model = train_xgbooster(train_X=train_X, train_Y=train_Y,
                            test_X=test_X, test_Y=test_Y)
    save_model(model, 'model/xgb-model.pkl')
    generate_submission(model, data_test)

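# Both main() variants above call data_train_test_split without defining it.
# A minimal sketch, assuming a thin wrapper over sklearn's train_test_split
# that reorders the outputs to (train_X, train_Y, test_X, test_Y); the split
# ratio and seed are illustrative guesses, not taken from the source:
from sklearn.model_selection import train_test_split

def data_train_test_split(data_X, data_Y, test_size=0.2, random_state=42):
    train_X, test_X, train_Y, test_Y = train_test_split(
        data_X, data_Y, test_size=test_size, random_state=random_state)
    return train_X, train_Y, test_X, test_Y
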
def find_my_chances(gpa, gmat, age, race, university, major, gender):
    # Create the list of strings that triggers the applicant-profile parsing.
    gpa_str = "{} GPA".format(gpa)
    gmat_str = "{} GMAT".format(gmat)
    demo_str = "{a} year old {r} {g}".format(a=age, r=race, g=gender)
    school_info = "Degree in {m} at {uni} (University)".format(m=major, uni=university)
    app_profile = [gpa_str, gmat_str, demo_str, school_info]
    odds = ""
    for school in TARGET_LABELS:
        odds += "{}: 0.0\n".format(school)
    ap = ApplicantProfile(app_profile, odds)
    d = {}
    d["GMAT"] = ap.gmat_score
    d["GPA"] = ap.gpa
    d["UNIVERSITY"] = ap.uni
    d["MAJOR"] = ap.major
    d["JOBTITLE"] = ap.job_title
    d["GENDER"] = ap.gender
    d["RACE"] = ap.race
    d["AGE"] = ap.age
    d["INTERNATIONAL"] = ap.international
    d["ODDS"] = ap.odds.encode('utf-8').strip()
    df = pd.DataFrame(d, index=[0])
    # `colnames` (the training-set column list) is a global; `mycolnames` is unused here.
    schooldata_dict, mycolnames = preprocess_data(df)
    print("\n {d}".format(d=d))
    for school, indf in schooldata_dict.items():
        # If any columns from the training set are missing, add them with dummy values.
        for col in colnames:
            if col not in indf['features'].columns:
                indf['features'][col] = 0.0
        features_df = indf['features'][colnames]
        # print(features_df)
        df2predictfrom = features_df.values
        df2predictfrom = np.delete(df2predictfrom, 0, axis=1)
        try:
            chance = MODELS[school]['model'].predict(df2predictfrom)
        except KeyError:
            # Without a model, `chance` is undefined, so skip this school
            # (the original fell through and would hit a NameError below).
            print("No model for {}".format(school))
            continue
        # print("Coefficients: {}".format(MODELS[school].coef_))
        if school in ['Harvard', 'Wharton', 'Stanford', 'Booth']:
            print("{s} odds: {c}".format(s=school, c=chance))

def loading(self, load_total, sys_ticks, data_file='data/loads_model.xls',
            filter_col='BMS编号'):
    '''Load the load data and merge it into load_pre.

    Defaults to load.xls.
    '''
    if self.obj is None:
        fp.output_msg(
            "The load data was not loaded because the load model has not been created!"
        )
    # Read the data file.
    self.load_data = fp.read_load_file(self.load_type, data_file)
    if data_file == 'data/loads_model.xls':
        data_d = False
    else:
        data_d = True
    # Preprocess the data.
    self.load_data = dp.preprocess_data(self.load_data, self.sys_settings, data_d,
                                        col_sel=filter_col,
                                        row_sel='010101030613001F')
    self.loads_on(load_total, sys_ticks)

def prepare_df(self, preproc_type='yeo-johnson'):
    files = os.listdir('.')
    if self.data_csv is None:
        # preprocess_data(data_folder, clean_data_folder, flow_val_file,
        #                 transform=True, flows='mean', feature_columns=None)
        clean_data_folder = '_'.join([self.data_folder, 'transformed'])
        samples_data = preprocess_data(self.data_folder, clean_data_folder,
                                       self.flow_values, preproc_type,
                                       transform=True)
        success, df = get_data_and_labels(clean_data_folder)
        if success:
            self.data_csv = df
            self.x_data_cols = [
                col for col in self.data_csv.columns
                if col not in ['Flow', 'SampleName', 'FlowClass']
            ]
            return True
        return False
    # Make sure the flow class labels correspond.
    return True

def convert_and_process():
    # convert_ecgs()
    convert_xmls()
    data_x, data_y, fnames = dgen.get_data(return_fnames=True,
                                           location=cfg.converted_data_location)
    processed_data_x = dprep.preprocess_data(data_x)
    dprep.save_data(processed_data_x, data_y, cfg.processed_data_location, fnames)
    save_pulse_data()

def __init__(self, args):
    self.training_iters = args.training_iters
    self.display_steps = args.display_steps
    processed_data = preprocess_data(image_path=args.img_path,
                                     caption_path=args.caption_path,
                                     sample_size=args.sample_size,
                                     size=args.size,
                                     num_channels=args.num_channels)
    self.train, self.train_captions, self.vocab_size = processed_data.get_data()
    self.x_caption = tf.placeholder(tf.float32, shape=[None, self.vocab_size],
                                    name='x_caption')
    self.x_inp = tf.placeholder(tf.float32,
                                shape=[1, args.size[0], args.size[1], args.num_channels],
                                name='x_input')
    self.y = tf.placeholder(tf.float32, shape=[None, self.vocab_size], name='y_image')
    mod = model(self.vocab_size, args.bridge_size, self.x_caption, self.x_inp,
                self.y, args.size)
    self.cost, self.optimizer, self.accuracy = mod.full_model(learning_rate=0.0001)

def look_at_and_pre_process_data(data, rawdata, variables):
    # Now plot the input data.
    show_data = input("Show the raw data? (1 = YES): ")
    if show_data == '1':
        pl.plot_data(data, variables)
    # Preprocess the data (mean-centering, normalization).
    text_1 = "Pre-process the data (ENTER = normalization AND mean centering, "
    text_2 = "1 = JUST mean centering, 0 = None): "
    these_processes = input(text_1 + text_2)
    data, rawdata = dp.preprocess_data(these_processes, data, rawdata)
    # "Enhance" certain variables to put all their influence into one component.
    text = "Enhance variables? (As integers: Variable_1, Variable_2, ...; ENTER = none): "
    enhance_these = input(text)
    data, rawdata = dp.boost_variables(enhance_these, data, rawdata)

def load_data_tensors(data_file, num_examples=None):
    """Read data from a CSV file and convert it into lookup tensors pointing
    to tokens in the text."""
    raw_data = pd.read_csv(data_file)
    if num_examples is not None:
        raw_data = raw_data.head(num_examples)
    # Extract the slot information into separate columns.
    data_columns = build_slot_columns(raw_data)
    # Add the slot columns into the dataframe.
    data = pd.concat([raw_data, data_columns], axis=1)
    data = preprocess_data(data)
    new_mr = reconstruct_mr(data, data_columns.columns)
    data['new_mr'] = new_mr
    data['new_mr'] = data['new_mr'].apply(add_space_to_punctuation)
    input_tensor, mr_word2idx, mr_idx2word = tokenize(data['new_mr'])
    target_tensor, ref_word2idx, ref_idx2word = tokenize(data['ref'])
    return (input_tensor, target_tensor, ref_word2idx, ref_idx2word,
            mr_word2idx, mr_idx2word)

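# load_data_tensors depends on a tokenize() helper that is not shown. A
# minimal sketch, assuming the common tf.keras Tokenizer + pad_sequences
# pattern (the real helper may filter or lowercase differently):
import tensorflow as tf

def tokenize(texts):
    # Fit a word-level tokenizer, convert texts to id sequences, and pad them.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    tensor = tokenizer.texts_to_sequences(texts)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer.word_index, tokenizer.index_word
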
from data_preprocessing import preprocess_data
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

sequence_length = 30
buffer_size = 1024
batch_size = 32

(training_dataset, test_dataset, data_std, data_mean,
 n_state_labels, n_county_labels) = preprocess_data(sequence_length=sequence_length,
                                                    buffer_size=buffer_size,
                                                    batch_size=batch_size)


def build_model():
    state_emb_input = Input(shape=(1,))
    county_emb_input = Input(shape=(1,))
    data_input = Input(shape=(sequence_length, 3))
    state_emb = Embedding(input_dim=n_state_labels, output_dim=8,
                          input_length=1)(state_emb_input)
    county_emb = Embedding(input_dim=n_county_labels, output_dim=8,
                           input_length=1)(county_emb_input)
    embedding = Concatenate()([state_emb, county_emb])
    embedding = Flatten()(embedding)
    embedding = Dense(sequence_length, activation='linear')(embedding)
    embedding = Reshape((sequence_length, 1))(embedding)
    concat = Concatenate()([data_input, embedding])

            item_category_map[item].add(category)

    count = cur.execute(
        "select `id` from `book` where `id` not in("
        "select distinct `book_id` from `read_record`) "
        "and `id` not in(select distinct `book_id` from `buy_record`)")
    result = cur.fetchmany(count)
    all_book_not_been_read = []
    for row in result:
        all_book_not_been_read.append(row[0])
    cur.close()
    conn.commit()
    conn.close()
except Exception as e:  # was Python 2 syntax: `except Exception, e`
    print(Exception, ':', e)

preprocessed_dataset = data_preprocessing.preprocess_data(dataset)
mp = MostPopular(preprocessed_dataset)
mp.calc_item_popularity()
mp_by_ratio = MPByCategoryRatio(preprocessed_dataset, item_category_map)
mp_by_ratio.calc_item_popularity()
user_cf = HieraKmeansUserCF(preprocessed_dataset, item_category_map, n_sample, max_iter)
user_cf.calc_user_sim()
mutex.acquire()
PREPROCESSED_DATASET = preprocessed_dataset
ALL_BOOK_NOT_BEEN_READ = all_book_not_been_read
MP = mp
MP_BY_RATIO = mp_by_ratio
USER_CF = user_cf
mutex.release()

#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from data_preprocessing import preprocess_data

#%%
seasons = np.arange(1998, 2018, 1)
tourney, regseason, sub = preprocess_data(seasons)
# tourney columns:
# - unusable:
#   'Season', 'DayNum', 'T1ID', 'T2ID', 'T1PtsF', 'T2PtsF', 'T1PtsA', 'T2PtsA',
#   'T2Result', 'WDeltaRatio'
# - usable but meh:
#   'NumOT', 'T1Games', 'T2Games', 'GamesDiff', 'GamesRatio', 'RatingRatio'
# - usable:
#   'T1Seed', 'T2Seed', 'SeedDiff', 'SeedRatio', 'T1Loc',
#   'T1Rating', 'T1PtsFor', 'T1PtsAgainst', 'T1PtsDelta', 'T1WRatio', 'T1WDelta', 'T1WPyt', 'T1WWRatio',
#   'T2Rating', 'T2PtsFor', 'T2PtsAgainst', 'T2PtsDelta', 'T2WRatio', 'T2WDelta', 'T2WPyt', 'T2WWRatio',
#   'RatingDiff', 'PtsForDiff', 'PtsForRatio', 'PtsAgainstDiff', 'PtsAgainstRatio', 'PtsDeltaDiff', 'PtsDeltaRatio',
#   'WRatioDiff', 'WRatioRatio', 'WDeltaDiff', 'WPytDiff', 'WPytRatio',
#   'WWRatioDiff', 'WWRatioRatio'
# - label:
#   'T1Result'

#%%
tourney.plot(x='SeedDiff', y='T1Result', kind='scatter')  # label column is 'T1Result', not 'Result'
plt.show()

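#%%
# StandardScaler and PCA are imported above but unused in this cell. A hedged
# example of applying them to a few of the "usable" columns (this column
# subset is illustrative, not from the source):
usable_cols = ['SeedDiff', 'RatingDiff', 'PtsDeltaDiff', 'WRatioDiff']
X = StandardScaler().fit_transform(tourney[usable_cols].values)
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)
print('Explained variance ratio:', pca.explained_variance_ratio_)
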
combined_df = input_data_df.append(other_data_df)
combined_df.reset_index(inplace=True)

# catboost_data_dict = preprocess_data_4_catboost(data_df=input_data_df)
# for school, catboostpool in catboost_data_dict.items():
#     predicted_labels = catboost_pred(catboostpool)
#     labels = catboostpool.get_label()
#     display_metrics("Catboost for {}".format(school), predicted_labels, labels)

school_data_dict, colnames = preprocess_data(data_df=combined_df,
                                             output_path=OUT_FILE_PATH)
print(colnames)
MODELS = {}
# Would use iteritems, but what if I want to port to Python 3.5?
for school, feature_label_d in school_data_dict.items():
    features = feature_label_d['features'].values
    labels = feature_label_d['labels'].values
    print("Number of Samples for {}: {}\n".format(school, features.shape[0]))
    # Drop indices from the model.
    features = np.delete(features, 0, axis=1)
    # Test the model against the training data. We are using ALL of the data
    # for training, not splitting for cross-validation, because the dataset
    # for each school is TINY.
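    # The loop body is cut off here in the original. A hedged continuation,
    # assuming a simple per-school estimator stored into MODELS; the estimator
    # choice is illustrative, but the {'model': ...} shape matches how MODELS
    # is read in find_my_chances above.
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(max_iter=1000)
    clf.fit(features, labels)
    MODELS[school] = {'model': clf}
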
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import data_preprocessing as dp

TRAIN_DATA_PATH = './data/train.csv'
TEST_DATA_PATH = './data/test.csv'

x_train, y_train = dp.preprocess_data(TRAIN_DATA_PATH)
x_test, y_test = dp.preprocess_data(TEST_DATA_PATH)

regressor = LinearRegression()
model = regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

print('Score: %.2f' % model.score(x_test, y_test))
print('Variance Score: %.2f' % r2_score(y_test, y_pred))
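# mean_squared_error is imported above but never used; a natural addition is
# to report the test-set error alongside the two scores:
print('MSE: %.2f' % mean_squared_error(y_test, y_pred))
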
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tabulate import tabulate
import data_preprocessing as preprocessed_data
import pandas as pd
import sys
import os

if '__file__' not in globals():
    sys.path.append(
        os.getcwd() +
        '/Machine_Learning_A-Z_Mine/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing'
    )

# %% codecell
# Preprocess the data before any algorithm is applied.
x_train, x_test, y_train, y_test = preprocessed_data.preprocess_data()

# %% codecell
# Create the presentation table structure.
columns = ['Accuracy', 'Precision', 'Sensitivity', 'F1 Score']
index = []
data = []


def add_to_table(cm, algorithm, data, index):
    index.append(algorithm)
    tn, fp, fn, tp = cm.ravel()
    accuracy = round((tp + tn) / (tp + tn + fp + fn), 3)
    precision = round(tp / (tp + fp), 3)
    sensitivity = round(tp / (tp + fn), 3)
    f1_score = round(2.0 * precision * sensitivity / (precision + sensitivity), 3)
    # Append the metrics row for this algorithm (reconstructed: the original
    # snippet was truncated mid-line after the f1_score expression).
    data.append([accuracy, precision, sensitivity, f1_score])

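# A sketch of how add_to_table might be driven and rendered with the imported
# tabulate; the classifier choice and the confusion_matrix call are
# assumptions, not shown in the original snippet:
from sklearn.metrics import confusion_matrix

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(x_train, y_train)
cm = confusion_matrix(y_test, classifier.predict(x_test))
add_to_table(cm, 'Random Forest', data, index)
print(tabulate(pd.DataFrame(data, index=index, columns=columns),
               headers='keys', tablefmt='psql'))
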
def main():
    # data = preprocess_bad_file('result.tsv')
    # save_preprocessed_file(data, 'result.cropped.tsv')
    data = read_data('result.cropped.tsv')
    data = preprocess_data(data)