def test_vgg16():
    ds = DataSet(data_path='data/', shuffle=True, balance=True,
                 categorical=True, padd=True, combine=3)
    r = VGG16_net()
    epochs_count = 10
    history, net, clf = train(ds.X_train, ds.y_train, ds.X_test, ds.y_test,
                              r, epochs=epochs_count)
    # plot_training(history, net, epochs_count)
    if predict(ds.X_test, ds.y_test, net):
        full_ds = DataSet(data_path='data/', padd=True, combine=3)
        # submission(net, None, full_ds, network=True, trained=True)
        submmiss_clf(net, clf, full_ds)
def train_test_model(log_dir, hparams: dict):
    dataset = DataSet(fraction=1.0)
    optimiser = getattr(tf.keras.optimizers, hparams['optimizer'])
    schedule = scheduler(hparams, dataset)

    model = SequentialCNN(input_shape=dataset.input_shape(),
                          output_shape=dataset.output_shape())
    model.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer=optimiser(learning_rate=hparams['learning_rate']),
        metrics=['accuracy'],
    )
    history = model.fit(
        dataset.data['train_X'], dataset.data['train_Y'],
        batch_size=hparams['batch_size'],
        epochs=250,
        verbose=False,
        validation_data=(dataset.data['valid_X'], dataset.data['valid_Y']),
        callbacks=[
            EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                          patience=hparams['patience']),
            schedule,
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),  # log metrics
            hp.KerasCallback(log_dir, hparams),                                 # log hparams
        ])
    print({key: value[-1] for key, value in history.history.items()})
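# Example invocation (an illustrative sketch: the hparam keys match the ones
# read above and in scheduler(); the concrete values are assumptions):
if __name__ == '__main__':
    hparams = {
        'optimizer': 'Adagrad',
        'learning_rate': 0.1,
        'batch_size': 128,
        'patience': 10,
        'scheduler': 'plateau2',
    }
    train_test_model('../../../logs/convergence_search', hparams)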
def main():
    ds = DataSet()
    with open(INPUT_DATA_FILE, "r", newline='', encoding="utf8") as csv_file:
        ds.extract_from_csv(csv_file)
    # X_train, X_test, y_train, y_test = train_test_split(ds.X, ds.y, test_size=0.3, random_state=1)
    # sklearn's MLPClassifier takes solver='lbfgs' (not algorithm='l-bfgs')
    clf = MLPClassifier(solver='lbfgs', max_iter=50, alpha=1e-6,
                        hidden_layer_sizes=10000, random_state=1)
    # classifier = clf.fit(X_train, y_train)
    print(cross_val_score(clf, ds.X, ds.y, cv=10, n_jobs=-1))
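# A hedged extension of main() above (it uses only names already in scope
# there): summarise the ten folds instead of printing the raw score array.
#     scores = cross_val_score(clf, ds.X, ds.y, cv=10, n_jobs=-1)
#     print("accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))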
def execute_online_pipeline(
        input_file, system_name, features,
        training_start_date, validation_start_date, test_start_date):
    """Execute one independent run (the data is loaded again)."""
    utils.collect_garbage()

    time_label = (str(validation_start_date.date()) + ' to ' +
                  str(test_start_date.date()))
    queuelogger.set_context(time_label, system_name)

    data = loading.load_df(input_file, featurelist.get_columns(features))

    # The revision ids computed here slightly differ from the values in the
    # file constants.py. However, both computations result in exactly the same
    # training and validation set. The reason for different revision ids is
    # that the corpus does not contain bot revisions while the revision ids in
    # the constants file include bot revisions.
    training_start_index = DataSet.get_index_for_date_from_df(data, training_start_date)
    validation_start_index = DataSet.get_index_for_date_from_df(data, validation_start_date)
    test_start_index = DataSet.get_index_for_date_from_df(data, test_start_date)

    # _logger.debug('Training start revisionId: %s' % str(data.loc[training_start_index]['revisionId']))
    # _logger.debug(data.loc[training_start_index-5:training_start_index+5, ['revisionId', 'timestamp']])
    # _logger.debug('Validation start revisionId: %s' % str(data.loc[validation_start_index]['revisionId']))
    # _logger.debug(data.loc[validation_start_index-5:validation_start_index+5, ['revisionId', 'timestamp']])
    # _logger.debug('Test start revisionId: %s' % str(data.loc[test_start_index]['revisionId']))
    # _logger.debug(data.loc[test_start_index-5:test_start_index+5, ['revisionId', 'timestamp']])

    data = data[0:test_start_index]

    # The preprocessing transformation does not have to be applied to the whole data set.
    fit_slice = slice(0, validation_start_index)
    data = preprocessing.fit_transform(time_label, system_name, data, features, fit_slice)

    training = data[training_start_index:validation_start_index]
    validation = data[validation_start_index:test_start_index]

    if validation.get_system_name() == 'WDVD':
        metrics = classification.bagging_and_multiple_instance(
            training, validation, print_results=False)
    else:
        metrics = classification.default_random_forest(
            training, validation, print_results=False)

    metrics = metrics.reorder_levels(['Dataset', 'System', 'Classifier'])
    metrics[('ALL', 'VANDALISM_FRACTION')] = validation.get_vandalism_fraction()
    _print_metrics(metrics)
def process_data(method, path='data/'):
    print('Starting the process')
    ds = DataSet(method, full=True)

    folder_path = path + str(method.__name__)
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)

    print(ds.X_train.shape)
    print(ds.X_test.shape)
    print(ds.y_train.shape)

    np.save(os.path.join(folder_path, 'X_train.npy'), ds.X_train)
    np.save(os.path.join(folder_path, 'X_test.npy'), ds.X_test)

    y_train = ds.mapper.inverse_transform(ds.y_train)
    with open(os.path.join(folder_path, 'y_train.csv'), 'w') as file:
        file.write('id,scene_label\n')
        for n, label in enumerate(y_train):
            file.write('{:d},{:s}\n'.format(n, label))

    print('\nFiles have been saved into: {}'.format(folder_path))
def get_models_data(self):
    # plain float (np.float is gone from recent NumPy releases)
    interval = np.linspace(1, .1, num=self.__interval_n, dtype=float)
    container = dict()
    for model in self.__models:
        container[str(model)] = ModelData(model, self.__test_n)

    for method in self.__methods:
        for inter in interval:
            train_size = round(inter * 0.8, 3)
            test_size = round(inter * 0.2, 3)
            ds = DataSet(method, shuffle_data=True,
                         test_size=test_size, train_size=train_size)
            results = evaluate(deepcopy(self.__models), ds,
                               n=self.__test_n, debug=False)
            for result in results:
                container[result] += {method.__name__: results[result]}
    return container
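# Worked example of the shrinking-split arithmetic above (the 80/20 ratio is
# preserved while the overall fraction shrinks): inter = 1.0 gives
# train_size = 0.8 and test_size = 0.2; inter = 0.5 gives train_size = 0.4
# and test_size = 0.1.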
def scheduler(hparams: dict, dataset: DataSet):
    # string equality needs ==; `is` compares object identity
    if hparams['scheduler'] == 'constant':
        return LearningRateScheduler(lambda epochs: hparams['learning_rate'], verbose=False)

    if hparams['scheduler'] == 'linear_decay':
        return LearningRateScheduler(
            lambda epochs: max(hparams['learning_rate'] * (10. / (10. + epochs)),
                               min_lr(hparams)),
            verbose=False)

    if hparams['scheduler'].startswith('CyclicLR') \
            or hparams['scheduler'] in ["triangular", "triangular2", "exp_range"]:
        # DOCS: https://www.datacamp.com/community/tutorials/cyclical-learning-neural-nets
        # CyclicLR_triangular, CyclicLR_triangular2, CyclicLR_exp_range
        mode = re.sub(r'^CyclicLR_', '', hparams['scheduler'])

        # step_size should be an epoch multiple between 2 and 8, but a multiple of 2
        # (= one full up/down cycle)
        if hparams['patience'] <= 6:
            whole_cycles = 1    #   1/2  = 0.5  |   6/2  = 3
        elif hparams['patience'] <= 12:
            whole_cycles = 2    #   8/4  = 2    |  12/4  = 3
        elif hparams['patience'] <= 24:
            whole_cycles = 3    #  14/6  = 2.3  |  24/6  = 4
        elif hparams['patience'] <= 36:
            whole_cycles = 4    #  26/8  = 3.25 |  36/8  = 4.5
        elif hparams['patience'] <= 48:
            whole_cycles = 5    #  28/10 = 2.8  |  48/10 = 4.8
        elif hparams['patience'] <= 72:
            whole_cycles = 6    #  50/12 = 4.2  |  72/12 = 6
        elif hparams['patience'] <= 96:
            whole_cycles = 8    #  74/16 = 4.6  |  96/16 = 6
        else:
            whole_cycles = 12   # 100/24 = 4.2  | 192/24 = 8

        return CyclicLR(
            mode=mode,
            step_size=dataset.epoc_size() * (hparams['patience'] / (2.0 * whole_cycles)),
            base_lr=min_lr(hparams),
            max_lr=hparams['learning_rate'])

    if hparams['scheduler'].startswith('plateau'):
        factor = int((re.findall(r'\d+', hparams['scheduler']) + [10])[0])  # plateau2 || plateau10 (default)
        if 'sqrt' in hparams['scheduler']:
            patience = math.sqrt(hparams['patience'])   # plateau2_sqrt || plateau10_sqrt
        else:
            patience = hparams['patience'] / 2.0
        return ReduceLROnPlateau(
            monitor='val_loss',
            factor=1 / factor,
            patience=math.floor(patience),
            # min_lr=min_lr(hparams),
            verbose=False,
        )

    print("Unknown scheduler: ", hparams)
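# Worked example of the CyclicLR step_size arithmetic above (illustrative
# only; 938 is an assumed stand-in for dataset.epoc_size(), i.e. batches per
# epoch):
patience = 12                # -> whole_cycles = 2 in the table above
whole_cycles = 2
epoc_size = 938
step_size = epoc_size * (patience / (2.0 * whole_cycles))  # 938 * 3.0 = 2814 batches
# Each half-cycle spans 3 epochs, so the 12-epoch EarlyStopping patience
# window contains exactly 2 full up/down cycles.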
def submission(model, method):
    """Trains the model with the full data set and saves the predicted labels
    to a submission file."""
    ds = DataSet(method, full=True)
    model.fit(ds.X_train, ds.y_train)
    y_pred = model.predict(ds.X_test)
    save_submission(str(model), ds.mapper.inverse_transform(y_pred))
def get_splitting_indices(data, use_test_set):
    training_set_start = constants.TRAINING_SET_START
    if use_test_set:
        validation_set_start = constants.TEST_SET_START
        test_set_start = constants.TAIL_SET_START
    else:
        validation_set_start = constants.VALIDATION_SET_START
        test_set_start = constants.TEST_SET_START

    # transform revision id to index in data set
    training_set_start = DataSet.get_index_for_revision_id_from_df(
        data, training_set_start)
    validation_set_start = DataSet.get_index_for_revision_id_from_df(
        data, validation_set_start)
    test_set_start = DataSet.get_index_for_revision_id_from_df(
        data, test_set_start)

    return training_set_start, validation_set_start, test_set_start
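# Sketch of typical usage for the indices returned above (the helper name is
# illustrative; the slicing mirrors compute_data_frame() further down):
def split_data(data, use_test_set=False):
    training_start, validation_start, test_start = \
        get_splitting_indices(data, use_test_set)
    training_set = data[training_start:validation_start]
    validation_set = data[validation_start:test_start]
    return training_set, validation_set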
def main():
    ds = DataSet()
    with open(INPUT_DATA_FILE, "r", newline='', encoding="utf8") as csv_file:
        ds.extract_from_csv(csv_file)
    print("Ranking (descending)", ds.create_features_ranking(use_names=True))

    experiment_results = {}
    final_counter = Counter()
    for layer_size in HIDDEN_LAYER_SIZES:
        experiment_results[layer_size] = {}
        for n_features in range(1, ds.number_of_features, 1):
            result = run_experiment(ds.X, ds.y,
                                    hidden_layer_size=layer_size,
                                    n_features=n_features)
            experiment_results[layer_size][n_features] = result
            final_counter.update(result.counter)
            print_result(result, layer_size, n_features)

    print("\nNum of times features were selected: {}".format(final_counter))
    generate_plots(experiment_results, ds.number_of_features,
                   ds.col_names, final_counter)
def test_rnn():
    ds = DataSet(features.mfcc_spec, data_path='data/', shuffle=True,
                 balance=True, categorical=True)
    r = RNN1()
    epochs_count = 1
    history, net, clf = train(ds.X_train, ds.y_train, ds.X_test, ds.y_test,
                              r, epochs=epochs_count)
    plot_training(history, net, epochs_count)
    if predict(ds.X_test, ds.y_test, net):
        ds_sub = DataSet(full=True)
        submission(net, None, ds_sub, network=True, trained=True)
def build_dataset(df, y):
    _logger.debug('building dataset...')

    _logger.debug('slicing...')
    _logger.debug('meta...')
    n_meta = len(featurelist.get_meta_list())
    new_meta = df.iloc[:, 0:n_meta]

    _logger.debug('X...')
    new_X = df.iloc[:, n_meta:].values  # takes a looong time
    features = df.columns[n_meta:]

    _logger.debug('y...')
    new_Y = y.values

    utils.collect_garbage()

    _logger.debug('dataset...')
    new_data = DataSet()
    _logger.debug('set_meta...')
    new_data.set_meta(new_meta)
    _logger.debug('set_X...')
    new_data.set_X(new_X)
    _logger.debug('set_Y...')
    new_data.set_Y(new_Y)
    _logger.debug('set_features...')
    new_data.set_features(features)

    _logger.debug('building dataset...done.')
    return new_data
def compute_data_frame(data):
    _logger.debug("Splitting statistics...")
    training_set_start_index = 0  # compute statistics from start of dataset
    validation_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.VALIDATION_SET_START)
    test_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)

    training_set = data[training_set_start_index:validation_set_start_index]
    validation_set = data[validation_set_start_index:test_set_start_index]
    test_set = data[test_set_start_index:tail_set_start_index]

    result = []
    result.append(compute_splitting_statistics_row(training_set, 'Training'))
    result.append(compute_splitting_statistics_row(validation_set, 'Validation'))
    result.append(compute_splitting_statistics_row(test_set, 'Test'))
    result = pd.concat(result, axis=0)
    return result
        print('{:s}: {:.6f}'.format(str(model), score))

        # Take the average over all the measurements
        mean_meta = dict()
        for key in meta:
            mean_meta[key] = np.mean(meta[key])
        results[str(model)] = mean_meta

    return results


if __name__ == '__main__':
    # Create the dataset
    ds = DataSet(features.mean_over_time, shuffle=True)

    # Add/Remove tested models here.
    models = [
        SVM_model(), SVM_model('linear'), SVM_model('poly'),
        LR_model(), KNN_model(), RFC_model(), LDA_model()
    ]

    print(evaluate(models, ds, debug=True))
    # add different path for data files:
from src.utils import argparser, logging

logfile = './logs/parallax-tf/logs.txt'
try:
    os.makedirs(os.path.dirname(logfile))
except OSError as exc:
    if exc.errno == errno.EEXIST and os.path.isdir(os.path.dirname(logfile)):
        pass
    else:
        raise

FLAGS = argparser()
FLAGS.is_training = False

train_dataset = DataSet(fpath=FLAGS.train_file,
                        seqlen=FLAGS.seq_len,
                        n_classes=FLAGS.num_classes,
                        need_shuffle=False)
test_dataset = DataSet(fpath=FLAGS.test_file,
                       seqlen=FLAGS.seq_len,
                       n_classes=FLAGS.num_classes,
                       need_shuffle=False)
FLAGS.charset_size = train_dataset.charset_size
FLAGS.sync = True

resource_info = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '.', FLAGS.resource_info_file))

single_graph = tf.Graph()
with single_graph.as_default():
    ops, global_step = get_placeholders(FLAGS)
def main(files):
    utils.print_system_info()
    utils.init_pandas()
    _logger.info("FILES=" + str(files))

    # Load feature file for some statistics
    features = featurelist.get_meta_list() + featurelist.get_label_list()
    df = loading.load_df(files, featurelist.get_columns(features))

    test_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TEST_SET_START)
    tail_set_start = DataSet.get_index_for_revision_id_from_df(
        df, constants.TAIL_SET_START)
    df = df[test_set_start:tail_set_start]

    data = DataSet()
    data.set_meta(df.iloc[:, :-1])
    data.set_Y(df.iloc[:, -1].astype(np.float32))
    data.set_X(np.zeros((len(data), 1)))
    _logger.debug("Length of data: " + str(len(data)))

    # Load scores
    scores = pd.DataFrame()
    scores[REVISION_ID] = data.get_revision_ids()
    scores.set_index(REVISION_ID, inplace=True)
    for team, score_file in files['teams'].items():
        team_scores = load_vandalism_scores(score_file)
        team_scores.set_index(REVISION_ID, inplace=True)
        scores[team] = team_scores[VANDALISM_SCORE]
    scores.dropna(inplace=True)

    if len(data) != len(scores):
        raise Exception(
            "number of scores does not fit test set size: " +
            "len(data)={0} but len(scores)={1}".format(len(data), len(scores)))
    _logger.debug("Length of scores: " + str(len(scores)))

    # Evaluate teams
    meta_scores = compute_meta_scores(scores)
    scores = pd.concat([scores, meta_scores], axis=1)

    evaluate_teams(scores, data, save_scores=['META'])
    evaluate_teams_over_time(scores, data, EVALUATION_OVER_TIME_SUFFIX)

    scores, data = clean_data(scores, data)
    evaluate_teams(scores, data, suffix=EVALUATION_RESULTS_CLEANED_SUFFIX)
from src.dataset import DataSet

if __name__ == '__main__':
    FLAGS = argparser()
    FLAGS.is_training = False

    logfile = "./logs/tensorflow/log.txt"
    try:
        os.makedirs(os.path.dirname(logfile))
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(os.path.dirname(logfile)):
            pass
        else:
            raise

    train_dataset = DataSet(fpath=FLAGS.train_file,
                            seqlen=FLAGS.seq_len,
                            n_classes=FLAGS.num_classes,
                            need_shuffle=True)
    test_dataset = DataSet(fpath=FLAGS.test_file,
                           seqlen=FLAGS.seq_len,
                           n_classes=FLAGS.num_classes,
                           need_shuffle=True)
    FLAGS.charset_size = train_dataset.charset_size

    ops, global_step = get_placeholders(FLAGS)
    seq = ops['data']
    label = ops['labels']
    logits, _ = inference(seq, FLAGS)
    tf.losses.softmax_cross_entropy(label, logits)
    loss = tf.losses.get_total_loss()
            )
        except KeyError:
            print('Invalid kernel')
        self.name = 'SVM-{:s}'.format(kernel)

    def __str__(self):
        """Returns the name of the model"""
        return self.name


if __name__ == '__main__':
    import sys
    sys.path.append('.')
    from param_test import ParamTester
    from src.dataset import DataSet
    import src.feature_extraction as fe

    ds = DataSet(method=fe.mfcc_spec, shuffle=True)
    test = ParamTester(ds, SVM_model('poly'), iter_method={'C': 100}, debug=True)
    test.run()
    test.save_results()
    test.plot()
"Adagrad", # Best with LR=0.1 + triangular (slow/best) or plateau2 (quick) "SGD", # Best with LR=1 + triangular2 ### learning_rate vs optimizer + scheduler=constant | needs learning_rate=0.1 | random until 16 epocs, then quickly converges "Ftrl", # Only works with: LR=0.1 + plateau2/constant OR LR=1 + CyclicLR_triangular ]), "scheduler": hp.Discrete([ # 'constant', # 'linear_decay', 'plateau2', 'plateau2_sqrt', 'plateau10', 'plateau10_sqrt', 'CyclicLR_triangular', 'CyclicLR_triangular2', 'CyclicLR_exp_range' ]), } if __name__ == "__main__": dataset = DataSet(fraction=1.0) model = SequentialCNN(input_shape=dataset.input_shape(), output_shape=dataset.output_shape()) log_dir = "../../../logs/convergence_search" stats_history = hparam_search.hparam_search(hparam_options, model, dataset, log_root=log_dir, verbose=argv.verbose)
def _compute_backpressure_statistics(data):
    # Restrict computation to test dataset
    test_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TEST_SET_START)
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(data, constants.TAIL_SET_START)
    data = data[test_set_start_index:tail_set_start_index]

    data = data[[
        REVISION_ID, ITEM_ID, USER_NAME, REVISION_ACTION, ROLLBACK_REVERTED
    ]]

    REVISION_ID_INDEX = 0        # noqa
    ITEM_ID_INDEX = 1
    USER_NAME_INDEX = 2
    REVISION_ACTION_INDEX = 3
    ROLLBACK_REVERTED_INDEX = 4  # noqa

    data = data.values
    result = np.full(len(data), np.nan)
    revealed = pd.DataFrame()
    for i in range(len(data)):
        user_name = data[i][USER_NAME_INDEX]
        item_id = data[i][ITEM_ID_INDEX]
        prev_rev = data[i]
        for j in range(i + 1, min(len(data), i + 16)):
            rev = data[j]
            if rev[ITEM_ID_INDEX] == item_id:
                # Rollback within same session (same item id and same user name)
                if rev[USER_NAME_INDEX] == user_name:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed = revealed.append(pd.Series(prev_rev), ignore_index=True)
                        break
                # Rollback at beginning of next session
                else:
                    if rev[REVISION_ACTION_INDEX] == 'rollback':
                        result[i] = True
                        revealed = revealed.append(pd.Series(prev_rev), ignore_index=True)
                        break
                    else:
                        result[i] = False
                        revealed = revealed.append(pd.Series(prev_rev), ignore_index=True)
                        break

    n_revisions = result.size
    n_revealed_total = (~(np.isnan(result))).sum()
    n_revealed_regular = (result == True).sum()     # noqa
    n_revealed_vandalism = (result == False).sum()  # noqa

    _logger.info('n_revisions: ' + str(n_revisions))
    _logger.info('n_revealed_total: ' + str(n_revealed_total))
    _logger.info('n_revealed_vandalism: ' + str(n_revealed_vandalism))
    _logger.info('n_revealed_regular: ' + str(n_revealed_regular))
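# Self-contained illustration of the tri-state bookkeeping above: np.nan marks
# "no rollback decision found within the window", while True (1.0) and
# False (0.0) record the two revealed outcomes (the values here are made up).
import numpy as np
result = np.array([np.nan, 1.0, 0.0, np.nan, 1.0])
print((~np.isnan(result)).sum())  # 3 revisions with a decision
print((result == 1.0).sum())      # 2 marked True
print((result == 0.0).sum())      # 1 marked False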
def omit_holdout_df(df):
    """Drop the holdout (tail) part of the dataframe."""
    tail_set_start_index = \
        DataSet.get_index_for_revision_id_from_df(df, constants.TAIL_SET_START)
    df = df[:tail_set_start_index]
    return df
            priors=None, n_components=None, store_covariance=False, tol=1.0e-4)
        self.name = 'LDA'

    def __str__(self):
        return self.name


if __name__ == '__main__':
    import sys
    sys.path.append('.')
    from param_test import ParamTester
    from src.dataset import DataSet
    from src.feature_extraction import mean_over_time

    ds = DataSet(method=mean_over_time)
    solver_list = ['lsqr', 'eigen']
    test = ParamTester(ds, LDA_model(), iter_method={'shrinkage': 10}, debug=True)
    test.run()
    test.save_results()
    # test.plot()
def __init__(self, opts):
    self.dir = opts.dir
    self.report_every_steps = opts.train['report_every_steps']
    self.validation_every_steps = opts.train['validation_every_steps']
    self.checkpoint_every_steps = opts.train['checkpoint_every_steps']
    self.train_steps = opts.train['train_steps']
    self.vocab = Vocab(opts.cfg['vocab'])
    self.cuda = opts.cfg['cuda']
    self.n_steps_so_far = 0
    self.average_last_n = opts.train['average_last_n']
    self.steps = opts.train['steps']

    V = len(self.vocab)
    N = opts.cfg['num_layers']
    d_model = opts.cfg['hidden_size']
    d_ff = opts.cfg['feedforward_size']
    h = opts.cfg['num_heads']
    dropout = opts.cfg['dropout']
    factor = opts.cfg['factor']
    label_smoothing = opts.cfg['label_smoothing']
    warmup_steps = opts.cfg['warmup_steps']
    lrate = opts.cfg['learning_rate']
    beta1 = opts.cfg['beta1']
    beta2 = opts.cfg['beta2']
    eps = opts.cfg['eps']
    batch_size = opts.train['batch_size']
    max_length = opts.train['max_length']
    swap_bitext = opts.train['swap_bitext']

    self.sim_run = self.steps['sim']['run']
    p_uneven = self.steps['sim']['p_uneven']
    sim_pooling = self.steps['sim']['pooling']
    R = self.steps['sim']['R']
    align_scale = self.steps['sim']['align_scale']
    self.p_mask = self.steps['mlm']['p_mask']
    self.r_same = self.steps['mlm']['r_same']
    self.r_rand = self.steps['mlm']['r_rand']
    if 1.0 - self.r_same - self.r_rand <= 0.0:
        logging.error('r_mask={} <= zero'.format(1.0 - self.r_same - self.r_rand))
        sys.exit()

    self.model = make_model(V, N=N, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout)
    if self.cuda:
        self.model.cuda()

    self.optimizer = NoamOpt(
        d_model, factor, warmup_steps,
        torch.optim.Adam(self.model.parameters(), lr=lrate,
                         betas=(beta1, beta2), eps=eps))

    if self.steps['sim']['run']:
        if self.steps['sim']['pooling'] == 'align':
            self.criterion = AlignSIM()
        else:
            self.criterion = CosineSIM()
    else:
        # self.criterion = CrossEntropy(padding_idx=self.vocab.idx_pad)
        self.criterion = LabelSmoothing(size=V, padding_idx=self.vocab.idx_pad,
                                        smoothing=label_smoothing)
    if self.cuda:
        self.criterion.cuda()

    self.load_checkpoint()  # loads if it exists

    if self.sim_run:
        self.computeloss = ComputeLossSIM(self.criterion, sim_pooling, R,
                                          align_scale, self.optimizer)
    else:
        self.computeloss = ComputeLossMLM(self.model.generator, self.criterion,
                                          self.optimizer)

    token = OpenNMTTokenizer(**opts.cfg['token'])
    logging.info('read Train data')
    self.data_train = DataSet(self.steps, opts.train['train'], token, self.vocab,
                              sim_run=self.sim_run, batch_size=batch_size[0],
                              max_length=max_length, p_uneven=p_uneven,
                              swap_bitext=swap_bitext, allow_shuffle=True,
                              is_infinite=True)
    if 'valid' in opts.train:
        logging.info('read Valid data')
        self.data_valid = DataSet(self.steps, opts.train['valid'], token, self.vocab,
                                  sim_run=self.sim_run, batch_size=batch_size[1],
                                  max_length=max_length, p_uneven=p_uneven,
                                  swap_bitext=swap_bitext, allow_shuffle=True,
                                  is_infinite=False)
    else:
        self.data_valid = None
# encoding=utf-8
"""
Created on 2016-04-18

@author: lenovo
"""
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

from src.dataset import DataSet

# Bayes classifier objects
for classifier in [BernoulliNB(), MultinomialNB(), GaussianNB()]:
    print("classifier: [%s]" % type(classifier).__name__)

    # Dataset object
    data = DataSet()

    # Get the labeled training data
    train_X = data.get_train_data()
    train_Y = data.get_tag()

    # Train
    print("start training")
    classifier.fit(train_X, train_Y)
    print("training done")

    # Get the vectorized test data
    test_X = data.get_test_data()

    # Predict
    print("start predicting")
    result = classifier.predict(test_X)
    print("prediction done")
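    # The loop body computes `result` but never persists it; a minimal
    # follow-up sketch (the file name and the plain 0..n-1 id scheme are
    # assumptions), modeled on the CSV writing in process_data() earlier:
    with open('submission_{:s}.csv'.format(type(classifier).__name__), 'w') as f:
        f.write('id,label\n')
        for n, label in enumerate(result):
            f.write('{:d},{}\n'.format(n, label))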
import multiprocessing
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'  # 0, 1, 2, 3  # Disable TensorFlow logging
os.chdir(os.path.dirname(os.path.abspath(__file__)))

import tensorflow.keras as keras
import time

from src.dataset import DataSet
from src.examples.tensorflow import FunctionalCNN, SequentialCNN, ClassCNN, ClassNN
from src.utils.csv import predict_to_csv

timer_start = time.time()

dataset = DataSet()
config = {
    "verbose": False,
    "epochs": 12,
    "batch_size": 128,
    "input_shape": dataset.input_shape(),
    "output_shape": dataset.output_shape(),
}
print("config", config)

# BUG: ClassCNN accuracy is only 36% compared to 75% for SequentialCNN / FunctionalCNN
# SequentialCNN validation: | loss: 1.3756675141198293 | accuracy: 0.7430952
# FunctionalCNN validation: | loss: 1.4285654685610816 | accuracy: 0.7835714
# ClassCNN      validation: | loss: 1.9851970995040167 | accuracy: 0.36214286
# ClassNN       validation: | loss: 2.302224604288737  | accuracy: 0.09059524
models = {
class TestDataSet(TestCase):
    def setUp(self):
        self.init_column_names = ["feat_001", "X-coord", "h,std,dev", "Grade"]
        self.init_features = [[0., 0.1, 3.], [0., 0.2, 0.], [0., 0.3, 0.5]]
        self.init_classes = ["G3", "G1", "G1"]
        self.expected_extracted_column_names = self.init_column_names
        self.expected_extracted_features = [[1.7, 3., 0.09], [-5., -1.12, 0.]]
        self.expected_extracted_classes = ["G2", "G3"]
        self.data_set_dir = path.join(path.dirname(__file__), "data_sets")
        self.data_set = DataSet(X=self.init_features, y=self.init_classes,
                                col_names=self.init_column_names)

    def check_extracted(self):
        self.assertListEqual(self.expected_extracted_column_names, self.data_set.col_names)
        self.assertListEqual(self.expected_extracted_features, self.data_set.X)
        self.assertListEqual(self.expected_extracted_classes, self.data_set.y)

    def test_should_initialize_properly(self):
        self.assertListEqual(self.init_column_names, self.data_set.col_names)
        self.assertListEqual(self.init_features, self.data_set.X)
        self.assertListEqual(self.init_classes, self.data_set.y)

    def test_should_extract_features_and_classes_from_csv_with_header(self):
        with open(path.join(DATA_SETS_DIR, "mock_data_set_with_header.csv"),
                  "r", newline='', encoding="utf8") as csv_file:
            self.data_set.extract_from_csv(csv_file)
        self.check_extracted()

    def test_should_raise_error_on_feature_size_mismatch(self):
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]])
        with self.assertRaises(RuntimeError):
            DataSet(y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1], [1, 2]], y=[1, 1, 1])
        with self.assertRaises(RuntimeError):
            DataSet(col_names=["a"])
        with self.assertRaises(RuntimeError):
            DataSet(X=[[1, 2], [1, 2]], y=[1, 1], col_names=["a"])
        with open(path.join(DATA_SETS_DIR, "mock_data_set_corrupted.csv"),
                  "r", newline='', encoding="utf8") as csv_file:
            with self.assertRaises(RuntimeError):
                self.data_set.extract_from_csv(csv_file)

    def test_should_return_number_of_features(self):
        self.assertEqual(len(self.init_features[0]), self.data_set.number_of_features)

    def test_should_return_column_name(self):
        for index, element in enumerate(self.init_column_names):
            self.assertEqual(element, self.data_set.col_names[index])

    def test_should_create_ranking(self):
        ranking = [2, 1, 0]
        self.assertListEqual(
            self.data_set.create_features_ranking(use_names=False), ranking)
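# The suite above runs with the standard library test runner, e.g. (the
# module path is an assumption about the project layout):
#   python -m unittest tests.test_dataset -v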