# Пример #1 (Example #1 — snippet separator from the original scrape)
# 0
def started():
    """Run the end-to-end pipeline: load discussion data, train a Naive
    Bayes classifier, and print per-class f1/precision/recall.

    NOTE(review): the original wrapped this body in an
    ``if __name__ == '__main__'`` check *inside* the function, with the
    ``if`` and ``else`` branches byte-for-byte identical — the check is
    meaningless there, so the duplication was removed. Behavior is
    unchanged for every caller.
    """
    print("Ok let's go!")

    # Where to find data: (source_name, path_to_json) pairs.
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]

    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)

    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)

    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)

    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass,
                                                    test_dataset)

        print(performance_string.format(klass=klass, f1=f1, precision=precision, recall=recall, digits=3))
def runExperiment():
    """Run one federated-learning experiment end to end.

    All settings come from the global ``cfg`` dict: the random seed is
    encoded in the model tag, the run optionally resumes from a
    checkpoint, then trains/evaluates for the configured number of
    global epochs, checkpointing every epoch and copying the best
    checkpoint (by the pivot metric) to a ``*_best.pt`` file.
    """
    # The leading underscore-separated component of the tag is the seed.
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    # NOTE(review): eval() on a config-derived string — acceptable for
    # trusted configs only; unsafe if cfg['model_name'] is user-supplied.
    model = eval('models.{}(model_rate=cfg["global_model_rate"]).to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model, cfg['lr'])
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        # Full resume: restore epoch, splits, model, optimizer,
        # scheduler, and logger from the checkpoint.
        last_epoch, data_split, label_split, model, optimizer, scheduler, logger = resume(model, cfg['model_tag'],
                                                                                          optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        # Partial resume: reuse only the data split and model weights;
        # restart epoch count and open a fresh logger.
        last_epoch = 1
        _, data_split, label_split, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    else:
        # Fresh run: split the dataset across users and open a new logger.
        last_epoch = 1
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    if data_split is None:
        # Resumed checkpoints may lack a split; rebuild it.
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
    global_parameters = model.state_dict()
    federation = Federation(global_parameters, cfg['model_rate'], label_split)
    for epoch in range(last_epoch, cfg['num_epochs']['global'] + 1):
        logger.safe(True)
        train(dataset['train'], data_split['train'], label_split, federation, model, optimizer, logger, epoch)
        # presumably recomputes statistics (e.g. normalization) on the
        # train split to build the evaluation model — TODO confirm
        # what stats() actually returns.
        test_model = stats(dataset['train'], model)
        test(dataset['test'], data_split['test'], label_split, test_model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            # Plateau scheduler needs the tracked metric explicitly.
            scheduler.step(metrics=logger.mean['train/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        model_state_dict = model.state_dict()
        # 'epoch' is saved as epoch + 1 so a resumed run continues at
        # the next epoch rather than repeating this one.
        save_result = {
            'cfg': cfg, 'epoch': epoch + 1, 'data_split': data_split, 'label_split': label_split,
            'model_dict': model_state_dict, 'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(), 'logger': logger}
        save(save_result, './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        if cfg['pivot'] < logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            # New best pivot metric: remember it and keep a copy of
            # this checkpoint as the best model.
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy('./output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                        './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
# Пример #3 (Example #3 — snippet separator)
# 0
    def init_data(self):
        """Initialize transforms, datasets, and train/val/pool index splits.

        Mutates ``self.args`` in place: when ``init_size`` is None the
        whole base dataset becomes the train set (val/pool empty), and
        when ``max_size`` is None growth is disabled (``per_size`` = 0,
        ``max_size`` = ``init_size``).
        """
        print('Initialize dataset...')
        self.train_transform = data.get_transform(self.args.image_size,
                                                  self.args.train_transform)
        self.test_transform = data.get_transform(self.args.image_size,
                                                 self.args.test_transform)

        # load base dataset
        self.base_dataset, self.test_dataset = data.load_base_dataset(
            self.args)
        self.base_dataset.transform = self.train_transform
        self.test_dataset.transform = self.test_transform

        # split to train/val/pool set
        if self.args.init_size is None:
            # No initial size given: train on every sample, keep the
            # validation and pool sets empty, and disable growth.
            self.train_idx = list(range(len(self.base_dataset)))
            self.val_idx = []
            self.pool_idx = []
            self.args.init_size = len(self.base_dataset)
            self.args.per_size = 0
            self.args.max_size = len(self.base_dataset)
        else:
            self.train_idx, self.val_idx, self.pool_idx = data.split_dataset(
                self.base_dataset, self.args.ny, self.args.init_size,
                self.args.val_size)

        if self.args.max_size is None:
            # No growth budget: freeze the train set at its initial size.
            self.args.per_size = 0
            self.args.max_size = self.args.init_size

        # define trainset and pool as index-based views of the base dataset
        self.trainset = data_utils.Subset(self.base_dataset, self.train_idx)
        self.valset = data_utils.Subset(self.base_dataset, self.val_idx)
        self.pool = data_utils.Subset(self.base_dataset, self.pool_idx)
def cv_naive_bayes(dataset):
    """Cross-validate a Gaussian Naive Bayes classifier on `dataset`.

    Splits off a held-out partition (unused here), scales the training
    features, and returns the cross-validation performance measurement.
    """
    classifier = GaussianNB()

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.20,
                                                     random_state=99)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    return performance_measurement_cv(algorithm=classifier,
                                      features_train=x_train,
                                      labels_train=y_train)
# Пример #5 (Example #5 — snippet separator)
# 0
def cv_svm(dataset):
    """Cross-validate a linear-kernel SVM on `dataset`.

    Splits off a held-out partition (unused here), scales the training
    features, and returns the cross-validation performance measurement.
    """
    svm_clf = SVC(kernel='linear')

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.2,
                                                     random_state=51)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    return performance_measurement_cv(algorithm=svm_clf,
                                      features_train=x_train,
                                      labels_train=y_train)
# Пример #6 (Example #6 — snippet separator)
# 0
def cv_mlp(dataset):
    """Cross-validate a single-hidden-layer MLP classifier on `dataset`.

    NOTE: unlike the other cv_* helpers in this file, the features are
    not passed through transform() here.
    """
    clf = MLPClassifier(hidden_layer_sizes=(100, ),
                        max_iter=1000,
                        activation='relu',
                        solver='adam',
                        random_state=1)
    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.20,
                                                     random_state=0)
    return performance_measurement_cv(algorithm=clf,
                                      features_train=x_train,
                                      labels_train=y_train)
# Пример #7 (Example #7 — snippet separator)
# 0
def cv_random_forest(dataset):
    """Cross-validate a 100-tree random forest classifier on `dataset`.

    Splits off a held-out partition (unused here), scales the training
    features, and returns the cross-validation performance measurement.
    """
    forest = RandomForestClassifier(n_estimators=100)

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.20,
                                                     random_state=51)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    return performance_measurement_cv(algorithm=forest,
                                      features_train=x_train,
                                      labels_train=y_train)
# Пример #8 (Example #8 — snippet separator)
# 0
def runExperiment(model_tag):
    """Evaluate the best saved model identified by `model_tag` on the test set.

    The leading underscore-separated component of `model_tag` encodes the
    random seed. Results are written to ./output/result/<model_tag>.pkl.
    """
    seed = int(model_tag.split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    config.PARAM['randomGen'] = np.random.RandomState(seed)
    dataset = {'test': fetch_dataset(data_name=config.PARAM['data_name']['test'])['test']}
    # NOTE(review): 'radomGen' looks misspelled, but the same keyword is
    # used for split_dataset elsewhere in this file — confirm the callee's
    # parameter name before renaming.
    data_loader = split_dataset(dataset, data_size=config.PARAM['data_size'], batch_size=config.PARAM['batch_size'],
                                radomGen=config.PARAM['randomGen'])
    # NOTE(review): eval() on a config string — safe only for trusted configs.
    model = eval('models.{}().to(config.PARAM["device"])'.format(config.PARAM['model_name']))
    best = load('./output/model/{}_best.pkl'.format(model_tag))
    model.load_state_dict(best['model_dict'])
    result = test(data_loader['test'], model)
    save(result, './output/result/{}.pkl'.format(model_tag))
    return
# Пример #9 (Example #9 — snippet separator)
# 0
def train():
    """Train the depth model described by the config file given on the CLI.

    Loads and splits the dataset, builds train/test generators, and runs
    Keras ``model.fit`` with callbacks from ``create_callbacks``. The
    test split doubles as validation data during training.
    """
    # load config file and prepare experiment
    args = get_args()
    config = process_config(args.config)
    create_dirs([config.model_dir, config.tensorboard_dir])

    # load dataset file
    dataset = load_pair_paths(config)

    # split dataset train and test
    train_pairs, test_pairs = split_dataset(config, dataset)

    if config.debug:
        # Debug mode: shrink both splits to 100 pairs for a quick run.
        print("WARNING!!! DEBUG MODE ON! 100 training.")
        train_pairs = train_pairs[:100]
        print(train_pairs)
        test_pairs = test_pairs[:100]
        print(test_pairs)

    # Calculate steps for each epoch
    train_num_steps = calculate_num_iter(config, train_pairs)
    test_num_steps = calculate_num_iter(config, test_pairs)


    # Create the model
    model = depth_model(config)

    # set dynamic output shape (drop the leading batch dimension)
    config.output_size = list(model.output_shape[1:])

    # Create train and test data generators
    train_gen = tf_data_generator(config, train_pairs, is_training=True)
    test_gen = tf_data_generator(config,test_pairs, is_training=False)

    # Prepare for training
    model.compile(optimizer=select_optimizer(config), loss=select_loss(config))


    model.fit(
        train_gen,
        steps_per_epoch=train_num_steps,
        epochs=config.num_epochs,
        callbacks=create_callbacks(config),
        validation_data=test_gen,
        validation_steps=test_num_steps,
        verbose=1)



    print("Training Done.")
# Пример #10 (Example #10 — snippet separator)
# 0
def svm(dataset):
    """Train a linear-kernel SVM on `dataset` and measure its performance.

    Returns the 9-tuple produced by performance_measurement — presumably
    (accuracy, kappa, precision, recall, f-measure, matthews, roc-auc,
    pa, sp); confirm against that helper's definition.
    """
    SVMClassifier = SVC(kernel='linear')
    # BUG FIX: test_size was 20, which a fraction-based splitter would
    # reject (or treat as an absolute sample count). Every sibling
    # helper in this file passes the fraction 0.20, so do that here.
    features_train, features_test, labels_train, labels_test = split_dataset(
        dataset=dataset, test_size=0.20, random_state=51)

    features_train, features_test = transform(X_train=features_train,
                                              X_test=features_test)

    SVMClassifier.fit(features_train, labels_train)
    labels_pred = SVMClassifier.predict(features_test)

    ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement(
        labels_test=labels_test, labels_pred=labels_pred, algorithm_name="SVM")
    return ac, kp, ps, rc, fm, mc, ra, pa, sp
# Пример #11 (Example #11 — snippet separator)
# 0
def mlp(dataset):
    """Train an MLP classifier on `dataset` and measure its performance.

    NOTE: unlike the SVM/forest helpers, the features are not passed
    through transform() here.
    """
    clf = MLPClassifier(hidden_layer_sizes=(100, ),
                        max_iter=1000,
                        activation='relu',
                        solver='adam',
                        random_state=1)
    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.20,
                                                     random_state=0)

    clf.fit(x_train, y_train)  # Training step
    y_pred = clf.predict(x_test)  # Testing step

    ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement(
        labels_test=y_test, labels_pred=y_pred, algorithm_name="MLP")
    return ac, kp, ps, rc, fm, mc, ra, pa, sp
# Пример #12 (Example #12 — snippet separator)
# 0
def extract(dataset, config):
    """Evaluate a saved depth model on the test split of `dataset`.

    Loads weights from config.model_dir + config.prediction_model_name
    and returns (model.metrics_names, evaluation results).
    """
    model = depth_model(config)
    config.output_size = list(model.output_shape[1:])
    model.compile(optimizer=select_optimizer(config),
                  loss=select_loss(config),
                  metrics=[mean_absolute_error, mean_squared_error,
                           root_mean_squared_error, abs_relative, t_relative])
    model.load_weights(config.model_dir + config.prediction_model_name)

    # Only the test half of the split is evaluated here.
    train_pairs, test_pairs = split_dataset(config, dataset)
    steps = calculate_num_iter(config, test_pairs)
    generator = tf_data_generator(config, test_pairs, is_training=False)
    scores = model.evaluate(generator, steps=steps, verbose=1)
    tf.keras.backend.clear_session()
    return model.metrics_names, scores
# Пример #13 (Example #13 — snippet separator)
# 0
def cv_naive_bayes(dataset, rd, cv, scoring, test_size):
    """Return the mean cross-validation score of Gaussian NB on `dataset`.

    rd: random_state for the split; cv: fold count or splitter;
    scoring: sklearn scoring name; test_size: held-out fraction.
    """
    nb = GaussianNB()

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=test_size,
                                                     random_state=rd)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    scores = cross_val_score(nb, x_train, y_train, cv=cv, scoring=scoring)
    return scores.mean()
# Пример #14 (Example #14 — snippet separator)
# 0
def random_forest(dataset):
    """Train a 100-tree random forest on `dataset` and measure performance."""
    forest = RandomForestClassifier(n_estimators=100)

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=0.20,
                                                     random_state=51)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    forest.fit(x_train, y_train)
    y_pred = forest.predict(x_test)

    ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement(
        labels_test=y_test,
        labels_pred=y_pred,
        algorithm_name="RANDOM FOREST")
    return ac, kp, ps, rc, fm, mc, ra, pa, sp
def runExperiment(model_tag):
    """Run a test pass for the configured model, logging to runs/<model_tag>.

    The leading underscore-separated component of `model_tag` encodes the
    random seed.

    NOTE(review): no checkpoint is loaded before test() — the model is
    evaluated with freshly constructed weights; confirm this is intended.
    """
    model_tag_list = model_tag.split('_')
    seed = int(model_tag_list[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    randomGen = np.random.RandomState(seed)
    dataset = {
        'test':
        fetch_dataset(data_name=config.PARAM['data_name']['test'])['test']
    }
    # NOTE(review): 'radomGen' looks misspelled, but the same keyword is
    # used for split_dataset elsewhere in this file — confirm the callee's
    # parameter name before renaming.
    data_loader = split_dataset(dataset,
                                data_size=config.PARAM['data_size'],
                                batch_size=config.PARAM['batch_size'],
                                radomGen=randomGen)
    # NOTE(review): eval() on a config string — safe only for trusted configs.
    model = eval('models.{}().to(device)'.format(config.PARAM['model_name']))
    logger = Logger('runs/{}'.format(model_tag))
    print(config.PARAM)
    test(data_loader['test'], model, logger)
    return
def naive_bayes(dataset, test_size):
    """Train Gaussian Naive Bayes on `dataset` and measure its performance.

    test_size: fraction of `dataset` held out for testing.
    """
    nb = GaussianNB()

    x_train, x_test, y_train, y_test = split_dataset(dataset=dataset,
                                                     test_size=test_size,
                                                     random_state=51)
    x_train, x_test = transform(X_train=x_train, X_test=x_test)

    nb.fit(x_train, y_train)  # Training step
    y_pred = nb.predict(x_test)  # Testing step

    ac, kp, ps, rc, fm, mc, ra, pa, sp = performance_measurement(
        labels_test=y_test,
        labels_pred=y_pred,
        algorithm_name="NAIVE BAYES")
    return ac, kp, ps, rc, fm, mc, ra, pa, sp
# Пример #17 (Example #17 — snippet separator)
# 0
    def forward(self, x):
        """Embed token ids `x`, mean-pool over the sequence dimension,
        apply one hidden layer, and return per-class log-probabilities."""
        pooled = self.emb(x).mean(dim=1)
        hidden = self.activation(self.linear(pooled))
        logits = self.linear_out(hidden)
        return F.log_softmax(logits, dim=-1)


if __name__ == '__main__':

    # Demo driver: build a training DataLoader for the TED-talks data
    # and construct the MLP model.
    from data import load_ted_data, split_dataset, TedDataset
    from torch.utils.data import DataLoader

    # Tokens and labels are split by two separate split_dataset calls —
    # presumably the split is deterministic so the token/label
    # partitions stay aligned; TODO confirm.
    tokens_ted, labels = load_ted_data('ted_en-20160408.xml')
    tokens_train, tokens_dev, tokens_test = split_dataset(tokens_ted)
    labels_train, labels_dev, labels_test = split_dataset(labels)
    train_dataset = TedDataset(tokens_train, labels_train, min_frequency=10)

    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=train_dataset.collate_fn,
                                  batch_size=3,
                                  num_workers=4)

    # Minimal model configuration for this smoke run.
    config = {
        'model_folder': 'tmp',
        'embedding_size': 64,
        'hidden_size': 20,
    }

    mlp = MLP(config)
# Пример #18 (Example #18 — snippet separator)
# 0
import pdb

if __name__ == '__main__':
    print("Ok let's go!")

    # Where to find data: (source_name, path_to_json) pairs.
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]

    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)

    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)

    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)

    #pdb.set_trace()

    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(
            nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass,
                                                    test_dataset)

        # NOTE(review): the snippet is truncated at this point in the
        # source capture — the print( call below is incomplete.
        print(
# Пример #19 (Example #19 — snippet separator)
# 0
import torch
import torch.nn as nn
import data
import models
# from utils import train, test

# Global variables
PATH_TO_DATA = "dataset/preprocessed_data.csv"
random_seed = 42
on_gpu = False

# Seed the CPU RNG always; seed CUDA too when a GPU is available.
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    on_gpu = True

# Training parameters
params = {'epochs': 10, 'batch_size': 32}

# Load the data, split it, and wrap each split in a DataLoader
# keyed by phase name for downstream training/eval loops.
dataset = data.import_dataset(PATH_TO_DATA)
train_set, test_set, val_set = data.split_dataset(dataset)
train_loader, test_loader, val_loader = data.get_dataloaders(
    (train_set, test_set, val_set), batch_size=params['batch_size'])
loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}
# Пример #20 (Example #20 — snippet separator)
# 0
def main(model=None):
    """Full training loop for a windowed sequence model.

    Loads (or preprocesses and caches) the dataset, loads or creates the
    model, then trains for config.hm_epochs epochs in one of three modes:
    whole-dataset one-batch, parallel/combined per-batch, or plain
    per-datapoint truncated BPTT. Tracks train/dev losses, checkpoints at
    configured epochs, plots both loss curves, and optionally saves the
    final model. Returns (model, [train_losses, dev_losses]).
    The 'bars' config names suggest musical data — TODO confirm.
    """

    print(f'readying model & data @ {now()}')

    # Load cached data; preprocess and cache it on the first run.
    data = load_data()
    if not data:
        save_data(preprocess())
        data = load_data()

    if not model:
        if not config.fresh_model:
            model = load_model()
        if not model:
            # No saved model (or a fresh one was requested): build,
            # save, then reload so later save/load paths are uniform.
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ',end='')
        else: print('loaded ',end='')
        print(f'model: {describe_model(model)}')

    print(f'total files: {len(data)}, ',end='')

    data, data_dev = split_dataset(data)

    # Resolve the batch size: clamp to the train size, or -1 means
    # "use the dev-set size".
    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)

    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')

    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ',end='\n')

    # Shuffling between epochs is pointless when everything is one batch.
    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped//config.hm_bars_slide
    if config.ckp_save_epochs == -1: config.ckp_save_epochs = range(config.hm_epochs)

    data_losss, dev_losss = [], []

    if config.initialize_loss:

        # Record pre-training losses as a baseline.
        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model,data))
        dev_losss.append(dev_loss(model,data_dev))
        print(f'initial losses: {data_losss, dev_losss}')

    print(f'training started @ {now()}', flush=True)

    for ep in range(config.hm_epochs):

        loss = 0

        if config.train_parallel and config.train_combined:
            # Whole dataset as a single parallel batch: one grad step.
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model, ep, batch_size=batch_size)

        else:
            for i,batch in enumerate(batchify(data)):

                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}', end='', flush=True)

                batch_size = sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in batch)

                if config.train_parallel:
                    l,g = process_batch_parallel(model,batch)
                    loss += l
                    give_grads(model,g)

                elif config.train_combined:
                    loss += process_batch_combined(model, batch)

                else:
                    # Sequential path: carry recurrent states across the
                    # windows of a datapoint, detaching each step to
                    # truncate backprop through time.
                    for j,datapoint in enumerate(batch):
                        states = None
                        for k,(inp,lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl,out)

                sgd(model,batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model,ep,batch_size=batch_size)

                if config.disp_batches:
                    print(f', completed @ {now()}' ,flush=True)

        # Normalize by total windows processed this epoch.
        loss /= sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in data)

        data_losss.append(loss)
        dev_losss.append(dev_loss(model,data_dev))

        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)

        if ep in config.ckp_save_epochs:
            save_model(model,f'{config.model_save_path}_ckp{ep}')

    # Final losses after the last update step.
    data_losss.append(dev_loss(model,data))
    dev_losss.append(dev_loss(model,data_dev))

    print(f'final losses: {[data_losss[-1],dev_losss[-1]]}')

    print(f'training ended @ {now()}', flush=True)

    plot(data_losss)
    show()
    plot(dev_losss)
    show()

    # Keep the previously saved model as *_prev before overwriting.
    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(),config.model_save_path+'_prev')
        save_model(model)

    return model, [data_losss, dev_losss]
# Пример #21 (Example #21 — snippet separator)
# 0
def run():
    """Train and evaluate the MLP on the TED-talks dataset.

    Builds train/dev/test datasets and loaders, fits the model via
    Trainer, reloads the checkpoint written during fit(), and prints
    test accuracy.
    """

    # Config
    config = {
        'model_folder': 'tmp',
        'embedding_size': 50,
        'hidden_size': 25,
        'batch_size': 50,
        'epochs': 100
    }

    # Data: dev/test reuse the training vocabulary and keep raw outputs.
    tokens_ted, labels = load_ted_data('ted_en-20160408.xml')
    tokens_train, tokens_dev, tokens_test = split_dataset(tokens_ted)
    labels_train, labels_dev, labels_test = split_dataset(labels)

    train_dataset = TedDataset(tokens_train, labels_train, min_frequency=10)
    dev_dataset = TedDataset(tokens_dev, labels_dev,
                             vocabulary=train_dataset.vocabulary,
                             raw_output=True)
    test_dataset = TedDataset(tokens_test, labels_test,
                              vocabulary=train_dataset.vocabulary,
                              raw_output=True)

    def make_loader(dataset):
        # All three loaders share the train dataset's collate function.
        return DataLoader(dataset,
                          collate_fn=train_dataset.collate_fn,
                          batch_size=config['batch_size'],
                          num_workers=4)

    train_dataloader = make_loader(train_dataset)
    dev_dataloader = make_loader(dev_dataset)
    test_dataloader = make_loader(test_dataset)

    # Model
    model = MLP(config)
    model.initialize_features(data=train_dataset)
    model.build_model()

    # Logger: track accuracy, prefer higher scores.
    logger = BasicLogger(metric=accuracy_score, score_optimization='max')

    # Trainer
    trainer = Trainer(model=model, logger=logger)
    trainer.fit(train_dataloader, dev_dataloader, epochs=config['epochs'])

    # Reload the checkpoint written during fit().
    model.load('{}/{}.torch'.format(model.config['model_folder'],
                                    type(model).__name__.lower()))

    # Collect ground-truth labels, then score the trainer's predictions.
    target = []
    for batch in test_dataloader:
        target.extend(batch['output'].tolist())
    predictions = trainer.test(test_dataloader)
    print("Test Accuracy:", accuracy_score(target, predictions))