def started():
    """Run the full news-comment classification pipeline.

    Loads discussion JSON from three news sources, builds a featurized
    dataset, trains a Naive Bayes classifier on an 80/20 split, and prints
    per-class f1/precision/recall on the held-out test data.
    """
    # BUG FIX: the original body duplicated the entire pipeline in both arms
    # of an `if __name__ == '__main__':` / `else:` pair with token-identical
    # code; the check is meaningless inside a function, so the body is kept
    # exactly once (behavior unchanged either way).
    print("Ok let's go!")
    # Where to find data
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]
    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)
    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)
    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)
    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass, test_dataset)
        print(performance_string.format(klass=klass, f1=f1, precision=precision,
                                        recall=recall, digits=3))
def runExperiment():
    """Run one federated-learning experiment described by the global ``cfg``.

    Seeds torch from the leading integer of ``cfg['model_tag']``, builds the
    dataset/model/optimizer/scheduler, optionally resumes from a checkpoint,
    trains for ``cfg['num_epochs']['global']`` epochs, saves a checkpoint
    every epoch, and copies it to ``*_best.pt`` whenever the pivot metric
    improves.
    """
    # Model tag convention: '<seed>_...' — the leading field seeds the RNGs.
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    # NOTE(review): eval-based model construction — acceptable only because
    # cfg['model_name'] is trusted configuration, never user input.
    model = eval('models.{}(model_rate=cfg["global_model_rate"]).to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model, cfg['lr'])
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        # Full resume: restore everything, including the epoch and logger.
        last_epoch, data_split, label_split, model, optimizer, scheduler, logger = resume(model, cfg['model_tag'], optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        # Warm-start: reuse model weights and splits, but restart training
        # from epoch 1 with a fresh optimizer/scheduler and a new log dir.
        last_epoch = 1
        _, data_split, label_split, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    else:
        # Fresh run: split the dataset across users and start a new log dir.
        last_epoch = 1
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    # Resumed checkpoints may predate the split-saving format; recompute.
    if data_split is None:
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
    global_parameters = model.state_dict()
    federation = Federation(global_parameters, cfg['model_rate'], label_split)
    for epoch in range(last_epoch, cfg['num_epochs']['global'] + 1):
        logger.safe(True)
        train(dataset['train'], data_split['train'], label_split, federation, model, optimizer, logger, epoch)
        # NOTE(review): stats() runs on the *train* split to produce the model
        # used for testing — presumably recomputing BN statistics; confirm.
        test_model = stats(dataset['train'], model)
        test(dataset['test'], data_split['test'], label_split, test_model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            # Plateau scheduler needs the monitored metric explicitly.
            scheduler.step(metrics=logger.mean['train/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        model_state_dict = model.state_dict()
        # Everything needed for either resume mode goes into the checkpoint.
        save_result = {
            'cfg': cfg, 'epoch': epoch + 1, 'data_split': data_split,
            'label_split': label_split, 'model_dict': model_state_dict,
            'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(), 'logger': logger}
        save(save_result, './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        # Track the best pivot metric and keep a copy of the best checkpoint.
        if cfg['pivot'] < logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy('./output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                        './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
def init_data(self):
    """Load the base/test datasets and build train/val/pool splits.

    Populates ``self.train_idx``/``val_idx``/``pool_idx`` and the matching
    ``Subset`` views; also back-fills ``self.args`` size fields when they
    are left unset.
    """
    print('Initialize dataset...')
    args = self.args
    # Transforms for the training and evaluation pipelines.
    self.train_transform = data.get_transform(args.image_size, args.train_transform)
    self.test_transform = data.get_transform(args.image_size, args.test_transform)
    # Base and held-out datasets, each with its transform attached.
    self.base_dataset, self.test_dataset = data.load_base_dataset(args)
    self.base_dataset.transform = self.train_transform
    self.test_dataset.transform = self.test_transform
    if args.init_size is None:
        # No initial-pool budget: everything is training data, val/pool empty.
        total = len(self.base_dataset)
        self.train_idx = list(range(total))
        self.val_idx = []
        self.pool_idx = []
        args.init_size = total
        args.per_size = 0
        args.max_size = total
    else:
        # Split off an initial labeled set, a validation set, and the pool.
        self.train_idx, self.val_idx, self.pool_idx = data.split_dataset(
            self.base_dataset, args.ny, args.init_size, args.val_size)
        if args.max_size is None:
            args.per_size = 0
            args.max_size = args.init_size
    # Index-based views over the shared base dataset.
    self.trainset = data_utils.Subset(self.base_dataset, self.train_idx)
    self.valset = data_utils.Subset(self.base_dataset, self.val_idx)
    self.pool = data_utils.Subset(self.base_dataset, self.pool_idx)
def cv_naive_bayes(dataset):
    """Cross-validated performance of Gaussian Naive Bayes on *dataset*."""
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=99
    )
    # Apply the module's shared feature transform to both splits.
    X_train, X_test = transform(X_train=X_train, X_test=X_test)
    return performance_measurement_cv(algorithm=GaussianNB(),
                                      features_train=X_train,
                                      labels_train=y_train)
def cv_svm(dataset):
    """Cross-validated performance of a linear-kernel SVM on *dataset*."""
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.2, random_state=51)
    # Apply the module's shared feature transform to both splits.
    X_train, X_test = transform(X_train=X_train, X_test=X_test)
    return performance_measurement_cv(algorithm=SVC(kernel='linear'),
                                      features_train=X_train,
                                      labels_train=y_train)
def cv_mlp(dataset):
    """Cross-validated performance of a one-hidden-layer MLP on *dataset*."""
    classifier = MLPClassifier(hidden_layer_sizes=(100, ), max_iter=1000,
                               activation='relu', solver='adam', random_state=1)
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=0)
    # NOTE(review): unlike the other cv_* helpers, no transform() is applied
    # to the split here — confirm skipping feature scaling is intentional.
    return performance_measurement_cv(algorithm=classifier,
                                      features_train=X_train,
                                      labels_train=y_train)
def cv_random_forest(dataset):
    """Cross-validated performance of a 100-tree random forest on *dataset*."""
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=51)
    # Apply the module's shared feature transform to both splits.
    X_train, X_test = transform(X_train=X_train, X_test=X_test)
    return performance_measurement_cv(
        algorithm=RandomForestClassifier(n_estimators=100),
        features_train=X_train,
        labels_train=y_train)
def runExperiment(model_tag):
    """Evaluate the best saved checkpoint for *model_tag* on the test split.

    The tag is '<seed>_...'; the leading integer seeds torch and numpy RNGs.
    Results are pickled to ./output/result/<model_tag>.pkl.
    """
    param = config.PARAM
    seed = int(model_tag.split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    param['randomGen'] = np.random.RandomState(seed)
    # Only the test split is fetched and evaluated.
    dataset = {'test': fetch_dataset(data_name=param['data_name']['test'])['test']}
    # `radomGen` (sic) matches the callee's parameter spelling.
    data_loader = split_dataset(dataset,
                                data_size=param['data_size'],
                                batch_size=param['batch_size'],
                                radomGen=param['randomGen'])
    # NOTE(review): eval on trusted configuration only — never user input.
    model = eval('models.{}().to(config.PARAM["device"])'.format(param['model_name']))
    best = load('./output/model/{}_best.pkl'.format(model_tag))
    model.load_state_dict(best['model_dict'])
    result = test(data_loader['test'], model)
    save(result, './output/result/{}.pkl'.format(model_tag))
    return
def train():
    """End-to-end depth-model training: config, data split, build, fit."""
    # Load the config file and prepare experiment directories.
    args = get_args()
    config = process_config(args.config)
    create_dirs([config.model_dir, config.tensorboard_dir])

    # Load the dataset file and split it into train/test pair lists.
    dataset = load_pair_paths(config)
    train_pairs, test_pairs = split_dataset(config, dataset)

    if config.debug:
        # Debug mode trains on a tiny slice so iterations are fast.
        print("WARNING!!! DEBUG MODE ON! 100 training.")
        train_pairs = train_pairs[:100]
        print(train_pairs)
        test_pairs = test_pairs[:100]
        print(test_pairs)

    # Steps per epoch for each split.
    train_num_steps = calculate_num_iter(config, train_pairs)
    test_num_steps = calculate_num_iter(config, test_pairs)

    # Build the model; its output shape is only known once it exists.
    model = depth_model(config)
    config.output_size = list(model.output_shape[1:])

    # Generators feeding train and validation data.
    train_gen = tf_data_generator(config, train_pairs, is_training=True)
    test_gen = tf_data_generator(config, test_pairs, is_training=False)

    # Compile and fit with the configured optimizer/loss and callbacks.
    model.compile(optimizer=select_optimizer(config), loss=select_loss(config))
    model.fit(
        train_gen,
        steps_per_epoch=train_num_steps,
        epochs=config.num_epochs,
        callbacks=create_callbacks(config),
        validation_data=test_gen,
        validation_steps=test_num_steps,
        verbose=1)
    print("Training Done.")
def svm(dataset):
    """Train a linear-kernel SVM on an 80/20 split of *dataset* and report metrics.

    Returns the 9-tuple (accuracy, kappa, precision, recall, f-measure,
    matthews, ra, pa, sp) produced by performance_measurement.
    """
    SVMClassifier = SVC(kernel='linear')
    # BUG FIX: test_size was 20, which sklearn-style splitters interpret as an
    # absolute count of 20 test samples; every sibling function in this module
    # (cv_svm, mlp, random_forest, naive_bayes) uses the 0.2 *fraction*, so
    # the fractional form is restored here for consistency.
    features_train, features_test, labels_train, labels_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=51)
    # Apply the module's shared feature transform to both splits.
    features_train, features_test = transform(X_train=features_train,
                                              X_test=features_test)
    SVMClassifier.fit(features_train, labels_train)
    labels_pred = SVMClassifier.predict(features_test)
    ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement(
        labels_test=labels_test, labels_pred=labels_pred, algorithm_name="SVM")
    return ac, kp, ps, rc, fm, mc, ra, pa, sp
def mlp(dataset):
    """Train a one-hidden-layer MLP on *dataset* and report held-out metrics."""
    classifier = MLPClassifier(hidden_layer_sizes=(100, ), max_iter=1000,
                               activation='relu', solver='adam', random_state=1)
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=0)
    classifier.fit(X_train, y_train)     # Training step
    y_pred = classifier.predict(X_test)  # Testing step
    # NOTE(review): no transform() call here, unlike svm/random_forest —
    # confirm the missing feature scaling is intentional for the MLP.
    return performance_measurement(
        labels_test=y_test, labels_pred=y_pred, algorithm_name="MLP")
def extract(dataset, config):
    """Evaluate a saved depth model on the test half of *dataset*.

    Loads the weights named by ``config.prediction_model_name`` and returns
    ``(metric_names, results)`` from Keras ``evaluate``.
    """
    model = depth_model(config)
    config.output_size = list(model.output_shape[1:])
    model.compile(optimizer=select_optimizer(config),
                  loss=select_loss(config),
                  metrics=[mean_absolute_error, mean_squared_error,
                           root_mean_squared_error, abs_relative, t_relative])
    model.load_weights(config.model_dir + config.prediction_model_name)

    # Only the test half of the split is evaluated here.
    _, test_pairs = split_dataset(config, dataset)
    steps = calculate_num_iter(config, test_pairs)
    generator = tf_data_generator(config, test_pairs, is_training=False)
    scores = model.evaluate(generator, steps=steps, verbose=1)

    # Drop graph/session state before returning to the caller.
    tf.keras.backend.clear_session()
    return model.metrics_names, scores
def cv_naive_bayes(dataset, rd, cv, scoring, test_size):
    """Mean cross-validation score of GaussianNB under the given setup.

    ``rd`` seeds the split, ``cv`` sets the fold count/strategy, ``scoring``
    names the sklearn metric, and ``test_size`` sizes the held-out split.
    """
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=test_size, random_state=rd)
    # Apply the module's shared feature transform to both splits.
    X_train, X_test = transform(X_train=X_train, X_test=X_test)
    scores = cross_val_score(GaussianNB(), X_train, y_train,
                             cv=cv, scoring=scoring)
    return scores.mean()
def random_forest(dataset):
    """Train a 100-tree random forest on *dataset* and report held-out metrics."""
    forest = RandomForestClassifier(n_estimators=100)
    X_train, X_test, y_train, y_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=51)
    # Apply the module's shared feature transform to both splits.
    X_train, X_test = transform(X_train=X_train, X_test=X_test)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    return performance_measurement(labels_test=y_test, labels_pred=y_pred,
                                   algorithm_name="RANDOM FOREST")
def runExperiment(model_tag):
    """Run the test pass for the experiment encoded in *model_tag*.

    The tag is '<seed>_...'; the leading integer seeds the torch and numpy
    RNGs so the data split is reproducible.
    """
    model_tag_list = model_tag.split('_')
    seed = int(model_tag_list[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    randomGen = np.random.RandomState(seed)
    # Only the test split is fetched and evaluated here.
    dataset = {
        'test': fetch_dataset(data_name=config.PARAM['data_name']['test'])['test']
    }
    # `radomGen` (sic) matches the callee's parameter spelling.
    data_loader = split_dataset(dataset, data_size=config.PARAM['data_size'], batch_size=config.PARAM['batch_size'], radomGen=randomGen)
    # NOTE(review): eval on trusted configuration only. Unlike the sibling
    # runExperiment that loads '*_best.pkl' weights, no checkpoint is loaded
    # before test() here — confirm evaluating fresh weights is intentional.
    model = eval('models.{}().to(device)'.format(config.PARAM['model_name']))
    logger = Logger('runs/{}'.format(model_tag))
    print(config.PARAM)
    test(data_loader['test'], model, logger)
    return
def naive_bayes(dataset, test_size): NBClassifier = GaussianNB() features_train, features_test, labels_train, labels_test = split_dataset( dataset=dataset, test_size=test_size, random_state=51 ) features_train, features_test = transform(X_train=features_train, X_test=features_test) NBClassifier.fit(features_train, labels_train) #Training step labels_pred = NBClassifier.predict(features_test) #Testing step ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement( labels_test=labels_test, labels_pred=labels_pred, algorithm_name="NAIVE BAYES" ) return ac, kp, ps, rc, fm, mc, ra, pa, sp
def forward(self, x):
    """Return per-class log-probabilities for a batch of inputs.

    Embeds the input, mean-pools over dim 1, applies one hidden layer with
    the configured activation, projects to the output size, and applies
    log-softmax over the last dimension.
    """
    # assumes x is a (batch, seq) tensor of token indices — TODO confirm
    h = self.emb(x)
    h = h.mean(dim=1)  # average embeddings across the sequence dimension
    h = self.activation(self.linear(h))
    h = self.linear_out(h)
    p = F.log_softmax(h, dim=-1)
    return p


if __name__ == '__main__':
    # Standalone scaffold: build a TED dataset + dataloader and construct an
    # MLP. Note nothing is trained or run here beyond construction.
    from data import load_ted_data, split_dataset, TedDataset
    from torch.utils.data import DataLoader
    tokens_ted, labels = load_ted_data('ted_en-20160408.xml')
    tokens_train, tokens_dev, tokens_test = split_dataset(tokens_ted)
    labels_train, labels_dev, labels_test = split_dataset(labels)
    train_dataset = TedDataset(tokens_train, labels_train, min_frequency=10)
    train_dataloader = DataLoader(train_dataset, collate_fn=train_dataset.collate_fn, batch_size=3, num_workers=4)
    config = {
        'model_folder': 'tmp',
        'embedding_size': 64,
        'hidden_size': 20,
    }
    mlp = MLP(config)
import pdb

if __name__ == '__main__':
    print("Ok let's go!")
    # Where to find data
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]
    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)
    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)
    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)
    #pdb.set_trace()
    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(
            nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass,
                                                    test_dataset)
        # BUG FIX: the final print() was truncated mid-call in the original
        # source; completed to match the token-identical report emitted by
        # the sibling script earlier in this file.
        print(performance_string.format(klass=klass, f1=f1,
                                        precision=precision, recall=recall,
                                        digits=3))
import torch
import torch.nn as nn

import data
import models
# from utils import train, test

# Global variables
PATH_TO_DATA = "dataset/preprocessed_data.csv"
random_seed = 42
on_gpu = False  # flipped to True below when CUDA is available

# Setting the seed (CPU always; CUDA too when a GPU is present)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    on_gpu = True

# Parameters
params = {'epochs': 10, 'batch_size': 32}

# Load data, split it, and build one DataLoader per split.
dataset = data.import_dataset(PATH_TO_DATA)
train_set, test_set, val_set = data.split_dataset(dataset)
train_loader, test_loader, val_loader = data.get_dataloaders(
    (train_set, test_set, val_set), batch_size=params['batch_size'])
# Loaders keyed by split name for downstream train/eval code.
loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}
def main(model=None):
    """Train (or continue training) the model over the configured epochs.

    Loads/creates the model and dataset, runs ``config.hm_epochs`` epochs in
    one of three modes (parallel one-batch, per-batch parallel/combined, or
    per-datapoint sequential), tracks train/dev losses, plots them, and
    optionally saves the trained model. Returns ``(model, [train_losses,
    dev_losses])``.

    NOTE(review): indentation below is reconstructed from a whitespace-mangled
    source; branch nesting around the initial-loss block should be verified
    against the original repository.
    """
    print(f'readying model & data @ {now()}')
    data = load_data()
    if not data:
        # No cached data yet: preprocess, save, and reload.
        save_data(preprocess())
        data = load_data()
    if not model:
        if not config.fresh_model:
            model = load_model()
        if not model:
            # Create, persist, and reload so we train the on-disk copy.
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ', end='')
        else:
            print('loaded ', end='')
    print(f'model: {describe_model(model)}')
    print(f'total files: {len(data)}, ', end='')
    data, data_dev = split_dataset(data)
    # Clamp/resolve batch size: -1 means "use the dev-set size".
    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)
    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')
    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ', end='\n')
    # Shuffling is pointless when the whole set fits in one batch.
    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped//config.hm_bars_slide
    if config.ckp_save_epochs == -1:
        # -1 means "checkpoint every epoch".
        config.ckp_save_epochs = range(config.hm_epochs)
    data_losss, dev_losss = [], []
    if config.initialize_loss:
        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model, data))
        dev_losss.append(dev_loss(model, data_dev))
        print(f'initial losses: {data_losss, dev_losss}')
    print(f'training started @ {now()}', flush=True)
    for ep in range(config.hm_epochs):
        loss = 0
        if config.train_parallel and config.train_combined:
            # Whole dataset processed as a single parallel batch.
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model, ep, batch_size=batch_size)
        else:
            for i, batch in enumerate(batchify(data)):
                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}', end='', flush=True)
                # Effective sample count used to normalize the update.
                batch_size = sum(sum(len(inp)*window_slide_multiplier for inp, lbl in datapoint) for datapoint in batch)
                if config.train_parallel:
                    l, g = process_batch_parallel(model, batch)
                    loss += l
                    give_grads(model, g)
                elif config.train_combined:
                    loss += process_batch_combined(model, batch)
                else:
                    # Sequential: carry hidden states through each datapoint,
                    # detaching between steps to truncate backprop.
                    for j, datapoint in enumerate(batch):
                        states = None
                        for k, (inp, lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl, out)
                sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model, ep, batch_size=batch_size)
                if config.disp_batches:
                    print(f', completed @ {now()}', flush=True)
        # Normalize the accumulated loss by the total effective sample count.
        loss /= sum(sum(len(inp)*window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
        data_losss.append(loss)
        dev_losss.append(dev_loss(model, data_dev))
        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)
        if ep in config.ckp_save_epochs:
            save_model(model, f'{config.model_save_path}_ckp{ep}')
    # Final loss snapshot after the last epoch.
    data_losss.append(dev_loss(model, data))
    dev_losss.append(dev_loss(model, data_dev))
    print(f'final losses: {[data_losss[-1], dev_losss[-1]]}')
    print(f'training ended @ {now()}', flush=True)
    plot(data_losss)
    show()
    plot(dev_losss)
    show()
    # Keep the previous on-disk model as *_prev before overwriting.
    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(), config.model_save_path+'_prev')
        save_model(model)
    return model, [data_losss, dev_losss]
def run():
    """Train the TED-talk MLP end to end, then print its test accuracy."""
    # Config
    config = {
        'model_folder': 'tmp',
        'embedding_size': 50,
        'hidden_size': 25,
        'batch_size': 50,
        'epochs': 100
    }

    # Load and split the raw TED data into train/dev/test.
    tokens_ted, labels = load_ted_data('ted_en-20160408.xml')
    tokens_train, tokens_dev, tokens_test = split_dataset(tokens_ted)
    labels_train, labels_dev, labels_test = split_dataset(labels)

    # Datasets: dev/test reuse the training vocabulary and keep raw outputs.
    train_dataset = TedDataset(tokens_train, labels_train, min_frequency=10)
    dev_dataset = TedDataset(tokens_dev, labels_dev,
                             vocabulary=train_dataset.vocabulary,
                             raw_output=True)
    test_dataset = TedDataset(tokens_test, labels_test,
                              vocabulary=train_dataset.vocabulary,
                              raw_output=True)

    # All three loaders share the training set's collate function.
    def make_loader(ds):
        return DataLoader(ds, collate_fn=train_dataset.collate_fn,
                          batch_size=config['batch_size'], num_workers=4)

    train_dataloader = make_loader(train_dataset)
    dev_dataloader = make_loader(dev_dataset)
    test_dataloader = make_loader(test_dataset)

    # Model
    model = MLP(config)
    model.initialize_features(data=train_dataset)
    model.build_model()

    # Logger / Trainer
    logger = BasicLogger(metric=accuracy_score, score_optimization='max')
    trainer = Trainer(model=model, logger=logger)
    trainer.fit(train_dataloader, dev_dataloader, epochs=config['epochs'])

    # Reload the weights the trainer saved to disk before testing.
    model.load('{}/{}.torch'.format(model.config['model_folder'], type(model).__name__.lower()))

    # Collect the gold labels, predict, and score.
    target = []
    for batch in test_dataloader:
        target.extend(batch['output'].tolist())
    predictions = trainer.test(test_dataloader)
    print("Test Accuracy:", accuracy_score(target, predictions))