def __init__(self, first_day, starting_cash):
    self.market_positions = [
        MarketPosition(DataUtils.get_date_from_row(first_day),
                       DataUtils.get_low_from_row(first_day),
                       starting_cash)
    ]
    self.cash = 0
def create_xy_train(self, tag_file, embedding_file, data_size=1, look_back=5,
                    threshold=0, suffix=None, mode="create", load=None):
    DataUtils.message("Preparing Training Data...", new=True)
    if mode == "create" or mode == "save":
        x_train, y_train = self.__create_xy_train(tag_file, embedding_file,
                                                  data_size, look_back,
                                                  threshold, suffix)
    if mode == "save":
        DataUtils.save_array(
            DataUtils.get_filename("ULSTM_X", "TRAIN" + "_" + str(look_back)),
            x_train)
        DataUtils.save_array(
            DataUtils.get_filename("ULSTM_Y", "TRAIN" + "_" + str(look_back)),
            y_train)
    if mode == "load" and load is not None:
        x_train = DataUtils.load_array(load[0])
        y_train = DataUtils.load_array(load[1])
    self.x_train = x_train
    self.y_train = y_train
    self.INPUT_SHAPE = x_train.shape
    self.OUTPUT_SHAPE = y_train.shape
def main():
    data_utils = DataUtils()
    clf_utils = ClassifierUtils()
    decision_documents, decision_labels = data_utils.load_decision_data()
    disagreement_documents, disagreement_labels = data_utils.load_disagreement_data()
    clf_metadata = {
        'type': 'RF',
        'n_estimators': 500,
        'max_depth': 128,
        'n_jobs': 8
    }
    features_metadata = {
        'type': 'count',
        'use_sw': True,
        'use_length': False,
        'binary': False,
        'normalize': False,
        'append_binary': False,
        'sampling': None
    }
    metrics = clf_utils.cross_validate(disagreement_documents,
                                       disagreement_labels,
                                       clf_metadata,
                                       features_metadata,
                                       num_splits=5)
    embed()
def run(self, data_row, cash_infusion, sell_out=False):
    high = DataUtils.get_high_from_row(data_row)
    low = DataUtils.get_low_from_row(data_row)
    date = DataUtils.get_date_from_row(data_row)
    if cash_infusion > 0:
        self.cash += cash_infusion
        self.amount_to_invest_per_day = self.cash / 30
    if self.cash > 0:
        if self.cash > self.amount_to_invest_per_day:
            new_market_position = MarketPosition(date, low,
                                                 self.amount_to_invest_per_day)
            self.cash -= self.amount_to_invest_per_day
        else:
            new_market_position = MarketPosition(date, low, self.cash)
            self.cash = 0
        self.market_positions.append(new_market_position)
    balance = 0
    if sell_out:
        for mp in self.market_positions:
            balance += mp.sell(date, high)
    else:
        for mp in self.market_positions:
            balance += mp.current_value(high)
    return balance
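# The two portfolio methods above rely on a MarketPosition class that is not shown in
# this section. The class below is a minimal hypothetical sketch of the interface they
# assume (open a position with some cash at the day's low, then value or liquidate it at
# a later price). The attribute names and fractional-share behaviour are assumptions,
# not the original implementation.
class MarketPosition:
    def __init__(self, date, price, cash):
        # Assumed: spend all of `cash` on (possibly fractional) shares at `price`.
        self.open_date = date
        self.shares = cash / price

    def current_value(self, price):
        # Mark-to-market value of the position at the given price.
        return self.shares * price

    def sell(self, date, price):
        # Assumed: liquidate the whole position at `price` and return the proceeds.
        proceeds = self.shares * price
        self.shares = 0
        return proceeds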
def create_xy_test(self, tag_file, embedding_file, data_size=1, look_back=5,
                   suffix=None, mode="create", load=None):
    DataUtils.message("Preparing Test Data...", new=True)
    if mode == "create" or mode == "save":
        x_test, y_test = self.__create_xy_test(tag_file, embedding_file,
                                               data_size, look_back, suffix)
    if mode == "save":
        DataUtils.save_array(
            DataUtils.get_filename("ULSTM_X", "TEST" + "_" + str(look_back)),
            x_test)
        DataUtils.save_array(
            DataUtils.get_filename("ULSTM_Y", "TEST" + "_" + str(look_back)),
            y_test)
    if mode == "load" and load is not None:
        x_test = DataUtils.load_array(load[0])
        y_test = DataUtils.load_array(load[1])
    self.x_test = np.array(x_test)
    self.y_test = np.array(y_test)
def create(self): DataUtils.message("Creating The Model...", new=True) word_input = Input(shape=(self.look_back, 300)) tag_input = Input(shape=(self.look_back, )) tag_emb = Embedding(self.distinct_tags + 1, 30, input_length=self.look_back, mask_zero=True, trainable=False)(tag_input) concat_emb = Concatenate()([word_input, tag_emb]) bilstm = Bidirectional( LSTM(300, dropout=0.35, recurrent_dropout=0.1, return_sequences=True))(concat_emb) hidden = TimeDistributed(Dense(800, activation="tanh"))(bilstm) output = TimeDistributed( Dense(self.distinct_words, activation="softmax"))(hidden) model = Model(inputs=[word_input, tag_input], outputs=output) model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy']) self.model = model
def create_xy_test(self, tag_file, embedding_file, data_size=1, window_size=5,
                   available_tags=[], suffix=None, mode="create", load=None):
    DataUtils.message("Preparing Test Data...", new=True)
    if mode == "create" or mode == "save":
        x_test, y_test = self.__create_xy(tag_file, embedding_file, data_size,
                                          window_size, available_tags, suffix)
    if mode == "save":
        DataUtils.save_array(
            DataUtils.get_filename("SFF", "X_TEST" + "_" + str(window_size)),
            x_test)
        DataUtils.save_array(
            DataUtils.get_filename("SFF", "Y_TEST" + "_" + str(window_size)),
            y_test)
    if mode == "load" and load is not None:
        x_test = DataUtils.load_array(load[0])
        y_test = DataUtils.load_array(load[1])
    self.x_test = np.array(x_test)
    self.y_test = np.array(y_test)
def create_xy_train(self, tag_file, embedding_file, data_size=1, window_size=5,
                    available_tags=[], suffix=None, mode="create", load=None):
    DataUtils.message("Preparing Training Data...", new=True)
    if mode == "create" or mode == "save":
        x_train, y_train = self.__create_xy(tag_file, embedding_file, data_size,
                                            window_size, available_tags, suffix)
    if mode == "save":
        DataUtils.save_array(
            DataUtils.get_filename("SFF", "X_TRAIN" + "_" + str(window_size)),
            x_train)
        DataUtils.save_array(
            DataUtils.get_filename("SFF", "Y_TRAIN" + "_" + str(window_size)),
            y_train)
    if mode == "load" and load is not None:
        x_train = DataUtils.load_array(load[0])
        y_train = DataUtils.load_array(load[1])
    self.x_train = np.array(x_train)
    self.y_train = np.array(y_train)
    self.INPUT_SHAPE = self.x_train.shape
    self.OUTPUT_SHAPE = self.y_train.shape
def create_xy_train(self, parse_tree_file, data_size=1, seq_len=10):
    DataUtils.message("Preparing Training Data...", new=True)
    x_train, y_train = self.__create_xy(parse_tree_file, data_size, seq_len)
    self.x_train = x_train
    self.y_train = y_train
def create_xy_train(self, dependency_tree, embedding_file, data_size=1,
                    look_back=0, mode="create", load=None):
    DataUtils.message("Preparing Training Data...", new=True)
    if mode == "create" or mode == "save":
        word_train, tag_train, probability_train = self.__create_xy(
            dependency_tree, embedding_file, data_size, look_back, test=False)
    self.word_train = word_train
    self.tag_train = tag_train
    self.probability_train = probability_train
def run():
    print(device_lib.list_local_devices())
    configuration = Configuration('configuration/configuration.cfg')
    DataUtils.check_and_create_folders(configuration)
    DataUtils.create_cache_if_not_exists(configuration)
    recognition = Recognition(configuration)
    recognition.train()
def save(self, note=""): DataUtils.message("Saving Model...", new=True) directory = "weights/" DataUtils.create_dir(directory) file = DataUtils.get_filename("UFF", note)+".h5" self.model.save(directory+file)
def create(self): DataUtils.message("Creating The Model...", new=True) input_forward = Input(shape=(self.seq_len, )) input_backward = Input(shape=(self.seq_len, )) head_forward = Input(shape=(self.seq_len, )) head_backward = Input(shape=(self.seq_len, )) word_embedding = Embedding(self.distinct_words, 128, input_length=self.seq_len, trainable=True) input_forward_embedding = word_embedding(input_forward) input_backward_embedding = word_embedding(input_backward) head_forward_embedding = word_embedding(head_forward) head_backward_embedding = word_embedding(head_backward) lstm_forward = LSTM(128) lstm_backward = LSTM(128) input_forward_lstm = lstm_forward(input_forward_embedding) input_backward_lstm = lstm_backward(input_backward_embedding) input_lstm = Concatenate()([input_forward_lstm, input_backward_lstm]) head_forward_lstm = lstm_forward(head_forward_embedding) head_backward_lstm = lstm_backward(head_backward_embedding) head_lstm = Concatenate()([head_forward_lstm, head_backward_lstm]) tag_output = Dense(18, activation="softmax")(input_lstm) input_hidden = Dense(100, activation=None) input_forward_hidden = input_hidden(input_lstm) head_hidden = Dense(100, activation=None) head_forward_hidden = head_hidden(head_lstm) sum_hidden = Add()([input_forward_hidden, head_forward_hidden]) tanh_hidden = Activation("tanh")(sum_hidden) arc_output = Dense(1, activation=None)(tanh_hidden) model = Model(inputs=[ input_forward, input_backward, head_forward, head_backward ], outputs=[tag_output, arc_output]) def nll1(y_true, y_pred): # keras.losses.binary_crossentropy give the mean # over the last axis. we require the sum return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1) model.compile(loss=['categorical_crossentropy', nll1], optimizer="adam", metrics=['accuracy']) self.model = model
def plot(self, note=""): DataUtils.message("Ploting Model...", new=True) directory = "plot/" DataUtils.create_dir(directory) file = DataUtils.get_filename("UFF", note)+".png" plot_model(self.model, to_file=directory+file, show_shapes=True, show_layer_names=False)
def __init__(self):
    self.num_classes = 2
    self.resnet50_weights = os.path.realpath(
        'models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
    self.xception_weights = os.path.realpath(
        'models/xception_weights_tf_dim_ordering_tf_kernels_notop.h5')
    self.model_output_path = os.path.realpath('data/model_output.h5')
    self.model_path = {
        'resnet50': os.path.realpath('data/model_resnet50.h5'),
        'xception': os.path.realpath('data/model_xception.h5')
    }
    self.transfer_classifiers = {
        'resnet50': (ResNet50, self.resnet50_weights),
        'xception': (Xception, self.xception_weights)
    }
    self.du = DataUtils()
def create_xy_test(self, parse_tree_file, data_size=1, seq_len=10):
    DataUtils.message("Preparing Validation Data...", new=True)
    x_test, y_test = self.__create_xy(parse_tree_file, data_size, seq_len,
                                      test=True)
    self.x_test = x_test
    self.y_test = y_test
def train(self, epochs, batch_size=32): DataUtils.message("Training...", new=True) self.model.fit([ self.word_train[0][0], self.word_train[0][1], self.tag_train[0][0], self.tag_train[0][1], self.word_train[1][0], self.word_train[1][1], self.tag_train[1][0], self.tag_train[1][1] ], self.head_train, epochs=epochs, batch_size=batch_size)
def test(self, classifier, model=None):
    du = DataUtils()
    X_test, y_test = du.data_preprocess('test')
    pred = self.predict(X_test, classifier, model)
    y_pred = np.zeros(len(pred), dtype=int)
    y_pred[pred[:, 1] > pred[:, 0]] = 1
    score = metrics.accuracy_score(y_test[:, 1], y_pred)
    logger_tc.info('test accuracy: %.3f' % score)
    # open in append mode so the dataset can be created if it does not exist yet
    with h5py.File(self.model_output_path, 'a') as model_output:
        if '%s_test_pred' % classifier not in model_output:
            model_output.create_dataset('%s_test_pred' % classifier, data=pred)
def __init__(self, reports_directory, src_server, src_index, src_type):
    self.data_loader_utils_dest = DataLoaderUtils(src_server, src_index,
                                                  src_type)
    self.reports_directory = reports_directory
    self.src_server = src_server
    self.src_index = src_index
    self.src_type = src_type
    self.delete_tags = True
    self.delete_annotations = True
    self.data_utils = DataUtils()
def normal_experiment(args):
    test_size = 0.3
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos,
                                           test_size=test_size,
                                           random_state=7014)
    neg_train, neg_test = train_test_split(neg,
                                           test_size=test_size,
                                           random_state=7014)
    pos_train_, neg_train_ = get_dataset_with_noise(pos_train,
                                                    neg_train,
                                                    noise_rate=args.noise_rate)
    if args.name == 'member':
        beam_step = 3
        N_beam = 3
    elif args.name == 'subtree':
        beam_step = 3
        N_beam = 15
    else:
        beam_step = 5
        N_beam = 10
    N_max = 50
    N = 1
    ilp_train = ILPProblem(pos_train_, neg_train_, bk, lang, name=args.name)
    ilp_train.print()
    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m,
                       infer_step=args.T)
    clauses_, Ws_list, loss_list_list = solver.train_N(N=N,
                                                       gen_mode='beam',
                                                       N_max=N_max,
                                                       T_beam=beam_step,
                                                       N_beam=N_beam,
                                                       epoch=args.epoch,
                                                       lr=args.lr,
                                                       wd=0.0)
    v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
    mse = compute_mse(pos_test, neg_test, v_list[0], facts)
    auc = compute_auc(pos_test, neg_test, v_list[0], facts)
    print('====== TEST SCORE =======')
    print('Mean-squared test error: ', mse)
    print('AUC: ', auc)
def __create_xy_test(self, tag_file, embedding_file, data_size=1, look_back=5,
                     suffix=None):
    x_test = []
    y_test = []
    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(
        DataUtils.extract_tag_list(corpus))
    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                   suffix[1])
    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    data_size = int(len(words) * min(data_size, 1)) - int(
        len(words) * min(data_size, 1)) % look_back
    for idx in np.arange(0, data_size, look_back):
        x_timestep = []
        y_timestep = []
        for jdx in range(look_back):
            word_input = word_emb[word_keys[idx + jdx]] if word_keys[
                idx + jdx] in word_emb else word_emb["UNK"]
            tag_input = tag_emb[tags[idx + jdx]]
            if (jdx == 0):
                x_timestep = [word_input]
                y_timestep = [tag_input]
            else:
                x_timestep = np.append(x_timestep, [word_input], axis=0)
                y_timestep = np.append(y_timestep, [tag_input], axis=0)
        x_timestep = np.array(x_timestep)
        y_timestep = np.array(y_timestep)
        if (idx == 0):
            x_test = [x_timestep]
            y_test = [y_timestep]
        else:
            x_test = np.append(x_test, [x_timestep], axis=0)
            y_test = np.append(y_test, [y_timestep], axis=0)
        if idx % int(data_size / (10 * look_back)) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return x_test, y_test
def get_all_data(batch_size, sentence_len, word2idx, label2idx, fold_num):
    utils = DataUtils(batch_size=batch_size,
                      sentence_len=sentence_len,
                      word2idx=word2idx,
                      label2idx=label2idx)
    # development set
    develop_sentences, develop_labels = utils.get_train_data(
        "./data/", mode='develop_')
    develop_idx_x_batches, develop_y_batches, develop_word_len_batches = utils.encoder_data2idx_batch(
        develop_sentences, develop_labels)
    # test set
    test_sentences, test_labels = utils.get_train_data("./data/", mode='test_')
    test_idx_x_batches, test_y_batches, test_word_len_batches = utils.encoder_data2idx_batch(
        test_sentences, test_labels)
    # training set
    train_sentences, train_labels = utils.get_train_data("./data/",
                                                         mode='train_')
    # 5-fold split of the training set
    k_fold_x_train, k_fold_y_train, k_fold_x_test, k_fold_y_test = DataUtils.k_fold(
        train_sentences, train_labels, fold_num)
    # the "k_" prefix marks data carved out of the training set
    k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list = [], [], []
    k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list = [], [], []
    if fold_num != 1:
        for fold_idx in range(fold_num):
            k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = utils.encoder_data2idx_batch(
                k_fold_x_train[fold_idx], k_fold_y_train[fold_idx])
            k_train_idx_x_batches_list.append(k_train_idx_x_batches)
            k_train_y_batches_list.append(k_train_y_batches)
            k_train_word_len_batches_list.append(k_train_word_len_batches)
            k_develop_idx_x_batches, k_develop_y_batches, k_develop_word_len_batches = utils.encoder_data2idx_batch(
                k_fold_x_test[fold_idx], k_fold_y_test[fold_idx])
            k_develop_idx_x_batches_list.append(k_develop_idx_x_batches)
            k_develop_y_batches_list.append(k_develop_y_batches)
            k_develop_word_len_batches_list.append(k_develop_word_len_batches)
    else:
        k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = utils.encoder_data2idx_batch(
            k_fold_x_train[0], k_fold_y_train[0])
        k_train_idx_x_batches_list.append(k_train_idx_x_batches)
        k_train_y_batches_list.append(k_train_y_batches)
        k_train_word_len_batches_list.append(k_train_word_len_batches)
    return k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list, \
        k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list, \
        develop_idx_x_batches, develop_y_batches, develop_word_len_batches, \
        test_idx_x_batches, test_y_batches, test_word_len_batches
def export_doc_ids(server, src_index, src_type, query=None):
    print(__name__, 'Fetching doc ids for', server, src_index, src_type)
    if query is None:
        query = {"match_all": {}}
    data_utils = DataUtils()
    ids = data_utils.batch_fetch_ids_for_query(base_url=server,
                                               index=src_index,
                                               type=src_type,
                                               query=query)
    documents_ids = dict.fromkeys(ids, None)
    print(__name__, 'Done, fetched', len(documents_ids), 'doc ids')
    return documents_ids
def __init__(self, config=config_reader()):
    """Read model parameters from the configuration."""
    self.rnn_mode = config['rnn_mode']
    self.batch_size = config['batch_size']
    self.embedding_dim = config['embedding_dim']
    self.num_layers = config['num_layers']
    self.num_units = config['num_utils']
    self.FCNN_num_units = config['FCNN_num_units']
    self.learning_rate = config['learning_rate']
    self.max_epoch = config['max_epoch']
    self.keep_prob = config['keep_prob']
    self.model_path = config['model_path']
    self.logs_file = config['logs_file']
    self.end_loss = config['end_loss']
    self.save_model_name = config['save_model_name']
    self.print_step = config['print_step']
    self.save_epoch = config['save_epoch']
    self.data_utils = DataUtils()
    self.vocab = self.data_utils.vocab
    self.chunk_size = self.data_utils.chunk_size
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.increment_global_step_op = tf.assign(self.global_step,
                                              self.global_step + 1)
def parse_date(self, test_string):
    test_string = DataUtils.remove_excess_spaces(test_string)
    # First, try to parse the date according to the specified format.
    parsed_date = self.parse_date_string(test_string)
    if parsed_date is not None:
        return parsed_date, parsed_date
    try:
        # If that fails, try to parse the date as a date range string.
        return daterangeparser.parse(test_string)
    except pyparsing.ParseException:
        # If that also fails, it may be a date range in a format that
        # daterangeparser doesn't recognize. Check whether the string contains
        # two formatted dates by growing substrings from the beginning and the
        # end until two date-like strings are found.
        test_start = len(test_string) - 1
        test_end = 0
        start = None
        end = None
        while test_end < len(test_string):
            if start is None:
                start = self.parse_date_string(test_string[0:test_end])
            if end is None:
                end = self.parse_date_string(
                    test_string[test_start:len(test_string)])
            if start is not None and end is not None:
                break
            test_start -= 1
            test_end += 1
        if start is None or end is None:
            raise ValueError('Could not parse date string: ' + test_string)
        return start, end
def get_experiment_feedback(session_id, robot_id):
    global data_thread
    experiment_ongoing = True
    feedback_received = False
    black_box_id = BBUtils.get_bb_id(robot_id)
    robot_smart_wheel_count = config.get_robot_smart_wheel_count(robot_id)
    diagnostic_vars = DataUtils.expand_var_names(experiment_diagnostic_vars,
                                                 robot_smart_wheel_count)
    zyre_communicator.reset_experiment_feedback(robot_id)
    while experiment_ongoing:
        feedback_msg = zyre_communicator.get_experiment_feedback(robot_id)
        if feedback_msg and feedback_msg['robot_id'] == robot_id:
            feedback_received = True
            experiment_ongoing = send_experiment_feedback(robot_id,
                                                          feedback_msg,
                                                          feedback_received)
            if experiment_ongoing:
                with data_thread_lock:
                    if not data_thread:
                        data_thread = threading.Thread(
                            target=send_diagnostic_data,
                            kwargs={
                                'session_id': session_id,
                                'black_box_id': black_box_id,
                                'diagnostic_vars': diagnostic_vars
                            })
                        data_thread.start()
    global feedback_thread
    feedback_thread = None
def get_download_query():
    '''Responds to a data download query by sending a query to the appropriate
    black box and then saving the data to a temporary file for download.
    '''
    robot_id = request.args.get('robot_id', '', type=str)
    black_box_id = BBUtils.get_bb_id(robot_id)
    variable_list = request.args.get('variables').split(',')
    start_query_time = request.args.get('start_timestamp')
    end_query_time = request.args.get('end_timestamp')
    query_msg = DataUtils.get_bb_query_msg(session['uid'].hex, black_box_id,
                                           variable_list, start_query_time,
                                           end_query_time)
    query_result = zyre_communicator.get_query_data(query_msg)
    message = ''
    try:
        with open(query_result_file_path, 'w') as download_file:
            json.dump(query_result, download_file)
        return jsonify(success=True)
    except Exception as exc:
        print('[get_download_query_robot_data] %s' % str(exc))
        message = 'Data could not be retrieved'
    return jsonify(message=message)
def load_data(self):
    self.du = DataUtils(self.config.training_file, self.config.testing_file,
                        self.config.batch_size)
    self.X_train = self.du.train_images
    self.y_train = self.du.train_labels
    self.X_val = self.du.val_images
    self.y_val = self.du.val_labels
    self.X_test = self.du.test_images
    self.y_test = self.du.test_labels
def main(path, graphics):
    t = DataUtils(path)
    train, test = t.train, t.test
    for _t in train + test:
        inp, out = _t['input'], _t['output']
        inp, out = np.asarray(inp), np.asarray(out)
        output_array = solve(inp, out, graphics)
        print(output_array)
def create(self): DataUtils.message("Creating The Model...", new=True) word_input_forward = Input(shape=(self.look_back,300)) word_input_backward = Input(shape=(self.look_back,300)) tag_input_forward = Input(shape=(self.look_back,)) tag_input_backward = Input(shape=(self.look_back,)) tag_emb = Embedding(self.distinct_tags, 30, input_length=self.look_back, trainable=True) tag_input_forward_output = tag_emb(tag_input_forward) tag_input_backward_output = tag_emb(tag_input_backward) input_forward = Concatenate()([word_input_forward, tag_input_forward_output]) input_backward = Concatenate()([word_input_backward, tag_input_backward_output]) word_head_forward = Input(shape=(self.look_back,300)) word_head_backward = Input(shape=(self.look_back,300)) tag_head_forward = Input(shape=(self.look_back,)) tag_head_backward = Input(shape=(self.look_back,)) tag_head_forward_output = tag_emb(tag_head_forward) tag_head_backward_output = tag_emb(tag_head_backward) head_forward = Concatenate()([word_head_forward, tag_head_forward_output]) head_backward = Concatenate()([word_head_backward, tag_head_backward_output]) bilstm = BiLSTM(300) bilstm_input = bilstm([input_forward,input_backward]) dense_input = Dense(600, activation="linear")(bilstm_input) bilstm_head = bilstm([head_forward,head_backward]) dense_head = Dense(600, activation="linear")(bilstm_head) sum_dense = Add()([dense_input,dense_head]) dense_tanh = Dense(600, activation="tanh")(sum_dense) output = Dense(1, activation="softmax")(dense_tanh) model = Model(inputs=[word_input_forward, word_input_backward, tag_input_forward, tag_input_backward, word_head_forward, word_head_backward, tag_head_forward, tag_head_backward], outputs=output) model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy']) self.model = model
__author__ = 'guoliangwang'

from data_utils import DataUtils
from sklearn.preprocessing import StandardScaler
import numpy as np

parkinson_features = "MDVP_Fo.Hz.,MDVP_Fhi.Hz.,MDVP_Flo.Hz.,MDVP_Jitter...,MDVP_Jitter.Abs.,MDVP_RAP,MDVP_PPQ,Jitter_DDP,MDVP_Shimmer,MDVP_Shimmer.dB.,Shimmer_APQ3,Shimmer_APQ5,MDVP_APQ,Shimmer_DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE"
parkinson_data_util = DataUtils(parkinson_features, "data_sets",
                                "parkinson_clean.csv",
                                "parkinson_testing.csv")
parkinson_clean_inputs = parkinson_data_util.training_inputs()
print('clean_inputs: ', parkinson_clean_inputs)

stdsc = StandardScaler()
clean_standard_inputs = stdsc.fit_transform(parkinson_clean_inputs)

# write the standardized data to csv
np.savetxt("parkinson_clean_standard_data_python.csv",
           clean_standard_inputs,
           delimiter=",")
# print("clean standard inputs: ", clean_standard_inputs.values)
import numpy as np
import timeit
import seaborn as sb
from plot_utils import PlotUtils
from data_utils import DataUtils
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree

wisconsin_features = "Clump.Thickness,Uniformity.of.Cell.Size,Uniformity.of.Cell.Shape,Marginal.Adhesion,Single.Epithelial.Cell.Size,Bare.Nuceoli,Bland.Chromatin,Normal.Nucleoli,Mitoses"
wisconsin_data_util = DataUtils(wisconsin_features, "data_sets",
                                "wisconsin_training.csv",
                                "wisconsin_testing.csv")
wisconsin_training_inputs = wisconsin_data_util.training_inputs()
wisconsin_training_classes = wisconsin_data_util.training_classes()
wisconsin_testing_inputs = wisconsin_data_util.testing_inputs()
wisconsin_testing_classes = wisconsin_data_util.testing_classes()

cross_validation = StratifiedKFold(wisconsin_training_classes, n_folds=5)
plot_utils = PlotUtils()

## Decision Tree below
decision_tree_classifier = DecisionTreeClassifier(random_state=0)
plot_tree = DecisionTreeClassifier(random_state=0, max_depth=6, max_features=1)
cv_scores = cross_val_score(plot_tree, wisconsin_training_inputs,
                            wisconsin_training_classes, cv=5)
# sb.distplot(cv_scores)
import numpy as np
import seaborn as sb
from plot_utils import PlotUtils
from data_utils import DataUtils
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree

decision_tree_classifier = DecisionTreeClassifier(random_state=0)

parkinson_features = "MDVP_Fo.Hz.,MDVP_Fhi.Hz.,MDVP_Flo.Hz.,MDVP_Jitter...,MDVP_Jitter.Abs.,MDVP_RAP,MDVP_PPQ,Jitter_DDP,MDVP_Shimmer,MDVP_Shimmer.dB.,Shimmer_APQ3,Shimmer_APQ5,MDVP_APQ,Shimmer_DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE"
parkinson_data_util = DataUtils(parkinson_features, "data_sets",
                                "parkinson_clean_normal_training.csv",
                                "parkinson_clean_normal_testing.csv")
parkinson_training_inputs = parkinson_data_util.training_inputs()
parkinson_training_classes = parkinson_data_util.training_classes()
parkinson_testing_inputs = parkinson_data_util.testing_inputs()
parkinson_testing_classes = parkinson_data_util.testing_classes()

## plot for data distribution
print("mean: ", np.mean(parkinson_training_inputs))

# decision_tree_classifier.fit(parkinson_training_inputs, parkinson_training_classes)
# score1 = decision_tree_classifier.score(parkinson_testing_inputs, parkinson_testing_classes)
# print("score1: ", score1)

cross_validation = StratifiedKFold(parkinson_training_classes, n_folds=5)
plot_utils = PlotUtils()
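# The two scripts above use the pre-0.20 scikit-learn layout (sklearn.cross_validation,
# sklearn.grid_search), where StratifiedKFold takes the labels and n_folds in its
# constructor. The snippet below is a sketch of the equivalent calls under the current
# sklearn.model_selection API; it reuses the parkinson_* variables defined above and is
# an assumption about how the script would be ported, not part of the original code.
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

modern_cv = StratifiedKFold(n_splits=5)  # labels now go to split()/cross_val_score, not the constructor
modern_scores = cross_val_score(DecisionTreeClassifier(random_state=0),
                                parkinson_training_inputs,
                                parkinson_training_classes,
                                cv=modern_cv)
print("5-fold CV accuracy (modern API):", modern_scores.mean())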