Example #1
	def __init__(self, mode = 'chat'):
		if not os.path.isdir(config.PROCESSED_PATH):
			data.prepare_raw_data()
			data.process_data()

		# create checkpoints folder if there isn't one already
		data.make_dir(config.CPT_PATH)

		if mode == 'chat':
			self.__chat_init()
Example #2
def main():
    #parser = argparse.ArgumentParser()
    #parser.add_argument('--mode', choices={'train', 'chat'},
    #                    default='train', help="mode. if not specified, it's in the train mode")
    #args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
Example #3
def main(args):
    if not os.path.isdir(config.PROCESSED_PATH):
        data.process_data()
    print('Data is ready!')

    data.make_dir(config.CPT_PATH)

    mode = args[-1]

    if mode == 'train':
        train()

    elif mode == 'test':
        predict()
Example #4
def main():
    lstm = load_model('model/100211_all/lstm.h5')
    gru = load_model('model/100211_all/gru.h5')
    saes = load_model('model/100211_all/saes.h5')
    models = [lstm, gru, saes]
    # models = [lstm]  # debug leftover: restricts evaluation to the LSTM model only
    names = ['LSTM', 'GRU', 'SAEs']
    lag = 12
    file1 = 'data/100211data/100211_weekend_train.csv'
    file2 = 'data/100211data/100211_weekend_test.csv'
    _, _, X_test, y_test, scaler = process_data(file1, file2, lag)
    y_test = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(1, -1)[0]

    y_preds = []
    for name, model in zip(names, models):
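        # SAEs consumes flat (samples, features) input; LSTM/GRU expect (samples, timesteps, 1)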
        if name == 'SAEs':
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))
        else:
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
        file = 'images/' + name + '.png'
        plot_model(model, to_file=file, show_shapes=True)
        predicted = model.predict(X_test)
        predicted = scaler.inverse_transform(predicted.reshape(-1, 1)).reshape(
            1, -1)[0]
        y_preds.append(predicted[0:288])
        print(name)
        eva_regress(y_test, predicted)

    plot_results(y_test[0:288], y_preds, names)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'}, default='train',
                        help="mode. if not specified, it's in the train mode")
    
    args = parser.parse_args()
    
    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data Ready!')
    
    data.make_dir(config.CPT_PATH)
    
    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'},
                        default='train', help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
Example #7
def get_data():
    data = None
    if 'pd' not in listdir('{}/processedData/'.format(ROOT)):
        data = process_data()
        # pickle requires binary mode in Python 3
        with open('{}/processedData/pd'.format(ROOT), 'wb') as f:
            pickle.dump(data, f)
    else:
        with open('{}/processedData/pd'.format(ROOT), 'rb') as f:
            data = pickle.load(f)
    # print("shape of data:", data.shape)
    return data
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'},
                        default='train', help="mode. if not specified, it's in the train mode")

    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()

    print("Data ready, starting application")
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
Example #9
def main(args):
    mode = None
    if len(args) == 2: mode, preload = args
    else: raise ValueError('Incorrect number of args.')

    if preload == 'none': preload = None
    if mode == 'vis':
        data = get_data()
        model = init_model(
            data=[data['embedding_matrix'], data['len_word_index']])
        return visualizer(model)
    if mode == 'train':
        return runner(50)
    if mode == 'confusion':
        model = init_model(preload=preload, declare=False)
        data = process_data()
        embedding_matrix = data['embedding_matrix']
        len_word_index = data['len_word_index']
        x_train, y_train = data['x_train'], data['y_train']
        x_val, y_val = data['x_val'], data['y_val']
        y_train = np.asarray(y_train, dtype=np.float32)
        x_train_1 = np.zeros(((y_train.shape[0]), max_claim_len))
        x_train_2 = np.zeros(((y_train.shape[0]), max_text_len))
        x_val_1 = np.zeros(((y_val.shape[0]), max_claim_len))
        x_val_2 = np.zeros(((y_val.shape[0]), max_text_len))

        # print x_train[1][1]
        for i in range(0, len(x_train)):
            x_train_1[i] = x_train[i][0]
        for i in range(0, len(x_train)):
            x_train_2[i] = x_train[i][1]

        for i in range(0, len(x_val)):
            x_val_1[i] = x_val[i][0]
        for i in range(0, len(x_val)):
            x_val_2[i] = x_val[i][1]
        y_true = data['y_val']
        y_pred = model.predict([x_val_1, x_val_2])
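        # convert one-hot ground-truth rows and predicted probability rows to class indices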
        y_t = []
        for i in range(0, len(y_true)):
            for j in range(0, 4):
                if y_true[i][j] == 1:
                    y_t.append(j)
        y_p = []
        for i in range(0, len(y_true)):
            for j in range(0, 4):
                if y_pred[i][j] == max(y_pred[i]):
                    y_p.append(j)

        print(len(y_t))
        print(len(y_p))

        print(get_confusion_matrix(y_t, y_p))
    else:
        raise ValueError('Incorrect mode')
Example #10
def main():
    start_time = time()
    print("Processing data, please wait...")
    process_data(jungle=False, ml=True)
    print(f"\nData processed in {round(time() - start_time, 3)}s.")
    print(
        "Press Enter to process teams in the ml.yaml file or 'q' followed by Enter to quit."
    )
    while True:
        user_input = input()
        if user_input.lower() in ("q", "quit"):
            break
        try:
            blue, red = get_teams_from_yaml()
            process_teams(blue, red)
        except Exception as e:
            traceback.print_tb(sys.exc_info()[2])
            print(e)
            print("[ERROR] Please check your inputs. See above for more info.")
        print("\nWaiting for next input...")
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('mode',
                        choices={'train', 'test', 'translate'},
                        default='train',
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'test':
        bleu_scores = test()
    elif args.mode == 'translate':
        translate()
Example #12
File: run.py Project: yyht/openie6
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    """

    if hparams.save is not None:
        checkpoint_callback = ModelCheckpoint(
            filepath=hparams.save + '/{epoch:02d}_{eval_acc:.3f}',
            verbose=True,
            monitor='eval_acc',
            mode='max',
            save_top_k=hparams.save_k if not hparams.debug else 0,
            period=0)
    else:
        checkpoint_callback = None

    if hparams.task == 'conj':
        hparams.train_fp = 'data/ptb-train.labels' if hparams.train_fp is None else hparams.train_fp
        hparams.dev_fp = 'data/ptb-dev.labels' if hparams.dev_fp is None else hparams.dev_fp
        hparams.test_fp = 'data/ptb-test.labels' if hparams.test_fp is None else hparams.test_fp
        if hparams.debug:
            hparams.train_fp = hparams.dev_fp = hparams.test_fp = 'data/debug_conj.labels'
    elif hparams.task == 'oie':
        hparams.train_fp = 'data/openie4_labels' if hparams.train_fp is None else hparams.train_fp
        hparams.dev_fp = 'carb/data/dev.txt' if hparams.dev_fp is None else hparams.dev_fp
        hparams.test_fp = 'carb/data/test.txt' if hparams.test_fp is None else hparams.test_fp
        if hparams.debug:
            hparams.train_fp = hparams.dev_fp = hparams.test_fp = 'data/debug_oie.labels'

    hparams.gradient_clip_val = 5 if hparams.gradient_clip_val is None else float(
        hparams.gradient_clip_val)

    train_dataset, val_dataset, test_dataset, meta_data_vocab, all_sentences = data.process_data(
        hparams)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=hparams.batch_size,
                                  collate_fn=data.pad_data,
                                  shuffle=True,
                                  num_workers=1)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=hparams.batch_size,
                                collate_fn=data.pad_data,
                                num_workers=1)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=hparams.batch_size,
                                 collate_fn=data.pad_data,
                                 num_workers=1)

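    # hparams.mode can chain stages with underscores (e.g. 'train_test');
    # each token is dispatched to the function of the same name via globals()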
    for process in hparams.mode.split('_'):
        globals()[process](hparams, checkpoint_callback, meta_data_vocab,
                           train_dataloader, val_dataloader, test_dataloader,
                           all_sentences)
Example #13
def main():
    """parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'},
                        default='train', help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()"""

    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')
    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
    mode = input("Input mode (train|chat): ")
    """if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()"""
    if mode == 'train':
        train()
    else:
        chat()
Example #14
def train(FLAGS):
    """
    Train our embeddings.
    """

    # Get data loaders
    print("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words = process_data(
        data_dir=FLAGS.data_dir,
        data_file=FLAGS.data_file,
        vocab_size=FLAGS.vocab_size,
        window_size=FLAGS.window_size,
        split_ratio=FLAGS.split_ratio,
        batch_size=FLAGS.batch_size,
    )
    print("[COMPLETE]")

    # Initialize model, criterion, loss
    print("==> Initializing model components ... ", end="")
    model = MLP(
        D_in=num_unique_words,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
    )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.lr)
    print("[COMPLETE]")

    # Train the model
    print("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
    )
    print("\n[COMPLETE]")

    # Save the model
    print("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print("\n[COMPLETE]")
Example #15
def train(FLAGS):
    """
    Train our embeddings.
    """

    # Get data loaders
    print ("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words = process_data(
        data_dir=FLAGS.data_dir,
        data_file=FLAGS.data_file,
        vocab_size=FLAGS.vocab_size,
        window_size=FLAGS.window_size,
        split_ratio=FLAGS.split_ratio,
        batch_size=FLAGS.batch_size,
        )
    print ("[COMPLETE]")

    # Initialize model, criterion, loss
    print ("==> Initializing model components ... ", end="")
    model = MLP(
        D_in=num_unique_words,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
        )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.lr)
    print ("[COMPLETE]")

    # Train the model
    print ("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        )
    print ("\n[COMPLETE]")

    # Save the model
    print ("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print ("\n[COMPLETE]")
Example #16
def make_kfold():
    '''
    For each of the three exams and each of the three neural-network models,
    runs training with stratified k-fold cross validation. Returns a 3D
    matrix containing, for every exam/network combination, a list of four
    values: accuracy and loss on the training set and on the validation set.
    '''
    raw_data = load_data()
    n_folds = 5

    resultado = np.zeros((3, 3, 4))  # [tr_lss,tr_acc, vl_lss,vl_acc]
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:
        print('\n-------------------------------------------')
        print('----------- Rede para ' + name + '------------------')
        print('-------------------------------------------\n')

        x, y = process_data(raw_data, exam_type)
        x = x.to_numpy()
        y = y.to_numpy()

        print("Formato conjunto treinamento: {0} e de teste: {1}\n"\
                        .format(x.shape, y.shape))

        skf = StratifiedKFold(n_splits=n_folds)
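        # stratified splitting keeps the class proportions in every fold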
        fold_iter = 1
        for train, val in skf.split(x, y):
            # each iteration is a new fold
            # (x_train_f, y_train_f, x_val_f, y_val_f)
            fold_data = create_fold(x, y, train, val)
            for model_n in NN:
                # each iteration tests one model
                if model_n == 0: network = NN1(fold_data[0].shape[1])
                elif model_n == 1: network = NN2(fold_data[0].shape[1])
                else: network = NN3(fold_data[0].shape[1])

                optimizer = torch.optim.Adam(network.parameters(),
                                             lr=LEARNING_RATE)
                criterion = torch.nn.BCELoss()
                valores = train_network(fold_data, network, optimizer, \
                                        criterion, MODEL_PATH[exam_type])
                resultado[exam_type][model_n] = np.sum([resultado[exam_type][model_n], \
                                                np.asarray(valores)], axis=0)

                print("Fold[{0}] model[{1}] accuracy on validation {2:.2f}% and train {3:.2f}%"\
                        .format(fold_iter, model_n+1, valores[3]*100, valores[1]*100))
            fold_iter += 1

    return np.divide(resultado, n_folds)
Example #17
def main(FLAGS):
    """
    """

    if FLAGS.mode == 'train':

        # Process the data
        train_data, test_data = process_data(
            data_dir=FLAGS.data_dir,
            split_ratio=FLAGS.split_ratio,
        )

        # Sample
        sample(
            data=train_data,
            data_dir=FLAGS.data_dir,
        )

        # Load components
        with open(os.path.join(basedir, FLAGS.data_dir, 'char2index.json'),
                  'r') as f:
            char2index = json.load(f)

        # Training
        train(
            data_dir=FLAGS.data_dir,
            char2index=char2index,
            train_data=train_data,
            test_data=test_data,
            num_epochs=FLAGS.num_epochs,
            batch_size=FLAGS.batch_size,
            num_filters=FLAGS.num_filters,
            learning_rate=FLAGS.lr,
            decay_rate=FLAGS.decay_rate,
            max_grad_norm=FLAGS.max_grad_norm,
            dropout_p=FLAGS.dropout_p,
        )

    elif FLAGS.mode == 'infer':

        # Inference
        infer(
            data_dir=FLAGS.data_dir,
            model_name=FLAGS.model_name,
            sentence=FLAGS.sentence,
        )

    else:
        raise Exception('Choose --mode train|infer')
Example #18
def main(FLAGS):
    """
    """

    if FLAGS.mode == 'train':

        # Process the data
        train_data, test_data = process_data(
            data_dir=FLAGS.data_dir,
            split_ratio=FLAGS.split_ratio,
            )

        # Sample
        sample(
            data=train_data,
            data_dir=FLAGS.data_dir,
            )

        # Load components
        with open(os.path.join(basedir, FLAGS.data_dir, 'char2index.json'), 'r') as f:
            char2index = json.load(f)

        # Training
        train(
            data_dir=FLAGS.data_dir,
            char2index=char2index,
            train_data=train_data,
            test_data=test_data,
            num_epochs=FLAGS.num_epochs,
            batch_size=FLAGS.batch_size,
            num_filters=FLAGS.num_filters,
            learning_rate=FLAGS.lr,
            decay_rate=FLAGS.decay_rate,
            max_grad_norm=FLAGS.max_grad_norm,
            dropout_p=FLAGS.dropout_p,
            )

    elif FLAGS.mode == 'infer':

        # Inference
        infer(
            data_dir=FLAGS.data_dir,
            model_name=FLAGS.model_name,
            sentence=FLAGS.sentence,
            )

    else:
        raise Exception('Choose --mode train|infer')
Example #19
def get_stats():
    config = {
        'unknown_freq': 2,
        'gold_ratio': 0.1,
        'inc_option': 'auxiliary',
        'auxiliary_option': 'detection',
        'seed': 66
    }
    dir_path = '/path/to/working/dir'
    set_random_seed(config['seed'])
    train_file = dir_path + '/data/ontonotes.development.ner'
    print('load data')
    train_data = get_data(train_file)
    gold_data, inc_data = split_data(train_data, config)
    print('get vocabulary')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    print('word_to_ix', len(word_to_ix))
    print('process data')
    inc_input_ids, inc_sent_ids, inc_pos_ids, inc_ner_ids = process_data(
        inc_data, word_to_ix, pos_to_ix, ner_to_ix)
    inc_ner_ids = get_incidental_data(inc_sent_ids, inc_input_ids, inc_pos_ids,
                                      inc_ner_ids, config)
    inc_label_counter = Counter()
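    # accumulate the fraction of each incidental NER label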
    for label in inc_ner_ids:
        # if label[0] == 'B' or label[0] == 'I':
        #    label = label[2:]
        inc_label_counter[label] += 1 / len(inc_ner_ids)
    print('inc label counter', inc_label_counter)
    inputs, sent_ids, pos_labels, ner_labels = inc_data
    word_seqs = generate_sent_seqs(inputs, sent_ids)
    pos_seqs = generate_sent_seqs(pos_labels, sent_ids)
    ner_seqs = generate_sent_seqs(ner_labels, sent_ids)
    inc_data = []
    sent_counter = Counter()
    for x in range(len(word_seqs)):
        inc_data.append((word_seqs[x], pos_seqs[x], ner_seqs[x]))
        sent_counter[len(word_seqs[x])] += 1 / len(word_seqs)
    print('average sent length', len(sent_ids) / len(word_seqs))
    print('sent length distribution', sent_counter.items())
Example #20
def run():
    seed_everything(args.seed)

    df = pd.read_csv(os.path.join(args.data_dir, "train.csv"))
    df = process_data(df, args.subset)
    df_folds = create_folds(df, args.n_folds)

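    # hold out the images whose fold equals args.fold for validation; train on the rest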
    train_image_ids = df_folds[df_folds["fold"] != args.fold].index.values
    valid_image_ids = df_folds[df_folds["fold"] == args.fold].index.values

    train_loader = get_train_loader(
        args.data_dir,
        df,
        train_image_ids,
        transforms=get_train_augs(args),
        do_cutmix=args.cutmix,
        batch_size=args.bs,
        num_workers=args.num_workers,
    )

    valid_loader = get_valid_loader(
        args.data_dir,
        df,
        valid_image_ids,
        transforms=get_valid_augs(args),
        batch_size=args.bs,
        num_workers=args.num_workers,
    )

    model = get_model(args.model_variant,
                      model_dir=args.model_dir,
                      checkpoint_path=args.load_path).cuda()

    if args.scheduler == "one_cycle":
        args.steps_per_epoch = len(train_image_ids) // args.bs
        scheduler_class, scheduler_params = get_scheduler(args)

    else:
        scheduler_class, scheduler_params = get_scheduler(args)

    learner = Learner(model, scheduler_class, scheduler_params, hparams=args)
    learner.fit(train_loader, valid_loader)
Example #21
def run_test_experiments(config):
    dir_path = '/path/to/working/dir'
    train_file = dir_path + '/data/ontonotes.development.ner'
    test_file = dir_path + '/data/ontonotes.test.ner'
    model_path = dir_path + '/models/MLPNet_' + config['para_option'] + '.pt'
    print('load data')
    train_data = get_data(train_file)
    test_data = get_data(test_file)
    print('get vocabulary and embeddings')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    vocab_embeddings = get_vocab_embeddings(word_to_ix)
    print('process data')
    test_input_ids, test_sent_ids, test_pos_ids, test_ner_ids = process_data(
        test_data, word_to_ix, pos_to_ix, ner_to_ix)
    print('get test input features')
    test_input_features = get_word_features(test_input_ids, test_sent_ids,
                                            vocab_embeddings)
    test_data = {
        'inputs': test_input_features,
        'sent_ids': test_sent_ids,
        'labels': test_ner_ids,
        'confidences': [1.0] * len(test_input_features)
    }
    print('test words', len(test_input_features))
    print('build model')
    model, loss_function, optimizer = build_model(config)
    print('load model')
    model.load_state_dict(torch.load(model_path))
    print('test model')
    test_accuracy = evaluate(test_data, model, ner_to_ix, config)
    print('test accuracy', test_accuracy)
Example #22
# -*- coding: utf-8 -*-
import pandas as pd

from bokeh.core.properties import field
from bokeh.io import curdoc
from bokeh.layouts import layout
from bokeh.models import (ColumnDataSource, HoverTool, SingleIntervalTicker,
                          Slider, Button, Label, CategoricalColorMapper)
from bokeh.palettes import Spectral6
from bokeh.plotting import figure

from data import process_data

fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions_list = process_data()

df = pd.concat({'fertility': fertility_df,
                'life': life_expectancy_df,
                'population': population_df_size},
               axis=1)

data = {}

regions_df.rename({'Group':'region'}, axis='columns', inplace=True)
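# for each year, slice that year's columns, drop the year level, and attach the region label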
for year in years:
    df_year = df.iloc[:,df.columns.get_level_values(1)==year]
    df_year.columns = df_year.columns.droplevel(1)
    data[year] = df_year.join(regions_df.region).reset_index().to_dict('series')

source = ColumnDataSource(data=data[years[0]])

plot = figure(x_range=(1, 9), y_range=(20, 100), title='Gapminder Data', plot_height=300)
Example #23
import pandas as pd

from bokeh.core.properties import field
from bokeh.embed import file_html
from bokeh.layouts import column
from bokeh.models import (
    ColumnDataSource, Plot, Circle, Range1d, LinearAxis, HoverTool, Text,
    SingleIntervalTicker, CustomJS, Slider, CategoricalColorMapper, Legend,
    LegendItem,
)
from bokeh.models.annotations import Title
from bokeh.palettes import Spectral6
from bokeh.resources import JSResources
from bokeh.util.browser import view

from data import process_data

fertility_df, life_expectancy_df, population_df_size, regions_df, years, regions_list = process_data()

sources = {}

region_name = regions_df.Group
region_name.name = 'region'

for year in years:
    fertility = fertility_df[year]
    fertility.name = 'fertility'
    life = life_expectancy_df[year]
    life.name = 'life'
    population = population_df_size[year]
    population.name = 'population'
    new_df = pd.concat([fertility, life, population, region_name], axis=1)
    sources['_' + str(year)] = ColumnDataSource(new_df)
Example #24
from tensorflow.keras.models import load_model
from data import process_data

# method used to compute the return for each model.

if __name__ == "__main__":
    model_type = "LSTM"
    model = load_model('models/{}.h5'.format(model_type))

    x_train, y_train, x_test, y_test = process_data(time_series=True, debug=True)
    total_return = 0
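    # go long when the model predicts a positive move, short otherwise (fixed size of 100)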
    for x in range(x_test.shape[0]):
        y_pred = model.predict(x_test[x:x+1])
        for y in range(y_pred.shape[1]):
            if y_pred[0, y] > 0:
                total_return += (100* y_test[x, y])
            else:
                total_return += (-100* y_test[x, y])

    print(total_return/x_test.shape[0])

    print('ok')
Example #25
    """

    if alg == 'kNN':
        clf = KNeighborsClassifier(n_neighbors=3)
    elif alg == 'SVM':
        clf = SVC(kernel='linear')
    elif alg == 'Decision Tree':
        clf = DecisionTreeClassifier(max_depth=5)
    elif alg == 'Logistic Regression':
        clf = LogisticRegression()

    return clf


# Read in dataset
df, feature_dict = process_data()
feature_names = list(feature_dict.keys())
feature_boxes = CheckboxGroup(labels=feature_names, active=[0, 1, 2, 3, 4])
s = ColumnDataSource(data=dict(x=[], real_y=[], predict_y=[]))

pred_ticker = Select(title='Predict:',
                     value='cancer',
                     options=TARGETS,
                     width=sidebar_width)
alg_ticker = Select(title='Classifier:',
                    value='kNN',
                    options=CLASSIFIERS,
                    width=sidebar_width)

# Plot the classification result
p = figure(**plot_params)
Example #26
def train(FLAGS):
    """
    Train our embeddings.
    """

    # Get data loaders
    print ("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words, \
        num_unique_documents, word_to_idx = process_data(
            data_dir=FLAGS.data_dir,
            vocab_size=FLAGS.vocab_size,
            window_size=FLAGS.window_size,
            split_ratio=FLAGS.split_ratio,
            batch_size=FLAGS.batch_size,
            )
    print ("[COMPLETE]")

    # Load pretrained GloVe embeddings for our vocab
    embedding_dir = os.path.join(basedir, "../../../../embeddings/glove")
    embedding_dim = 100
    embeddings = get_embeddings(
        embedding_dir=embedding_dir,
        embedding_dim=embedding_dim,
        words=word_to_idx.keys(),
        )

    # Initialize model, criterion, loss
    print ("==> Initializing model components ... ", end="")
    model = MLP(
        D_in_words=num_unique_words,
        D_in_documents=num_unique_documents,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
        embeddings=embeddings,
        )
    # Objective
    criterion = torch.nn.CrossEntropyLoss()
    # Optimizer
    # Only get the parameters with gradients (we freeze our GloVe embeddings)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=FLAGS.lr)
    print ("[COMPLETE]")

    # Train the model
    print ("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        log_every=FLAGS.log_every,
        )
    print ("\n[COMPLETE]")

    # Save the model
    print ("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print ("\n[COMPLETE]")
Example #27
LEARNING_RATE = 0.001
MODEL_PATH = [
    './trained_nn/igG.pt', './trained_nn/igM.pt', './trained_nn/PCR.pt'
]

if __name__ == "__main__":

    print('\nCARREGANDO DADOS...')
    data = load_data()
    start = time.time()
    min_max(data)
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:

        print('\n------------------------\n')
        print('[REDE ' + name + '] SEPARANDO DADOS PARA TREINAMENTO...')
        processed_data = process_data(data, exam_type)

        print('[REDE ' + name + '] ADEQUANDO DADOS PARA TREINAMENTO...')
        splitted_data = split_data(processed_data)

        print('[REDE ' + name + '] MONTANDO MODELO...')
        x_train = splitted_data[0]
        y_train = splitted_data[2]

        print('[REDE ' + name + '] ' + str(len(x_train)) + \
                ' ELEMENTOS NO CONJUNTO DE TREINAMENTO...')
        print('[REDE ' + name + '] ' + str(len(y_train)) + \
                ' ELEMENTOS NO CONJUNTO DE VALIDAÇÃO...')

        print('[REDE ' + name + '] INSTANCIANDO REDES...')
        network = NN2(x_train.shape[1])
Example #28
        default=False,
        action='store_true')

    args = parser.parse_args()
    data, label_to_id = load_data(args.train, args.dev, args.labels)
    train_data = data['train']
    validation_data = data['dev']
    vocab = None
    model = None
    if args.pretrained_model is not None:
        model, model_config, vocab, reverse_vocab = load_model(
            args.pretrained_model)
    print('\nLoading training data...')
    train_X, train_Y, vocab, reverse_vocab = process_data(
        train_data,
        label_to_id,
        vocab=vocab,
        vocab_size=args.vocab_size,
        max_tokens=args.sequence_length)
    print('Training data loaded.')
    print('\nLoading validation data...')
    validation_X, validation_Y, _, _ = process_data(
        validation_data,
        label_to_id,
        vocab=vocab,
        max_tokens=args.sequence_length)
    print('Validation data loaded.')
    print('\nGenerating batches...')
    train_batches = generate_batches(train_X, train_Y, args.batch_size)
    validation_batches = generate_batches(validation_X, validation_Y,
                                          args.batch_size)
    print('Batches finished generating.')
Example #29
def start_training():
    if not os.path.isdir(config.PROCESSED_PATH):
        data.prepare_raw_data()
        data.process_data()
    print('Data ready!')

    # create checkpoints folder if there isn't one already
    data.make_dir(config.CPT_PATH)
    """ Train the bot """
    test_buckets, data_buckets, train_buckets_scale = _get_buckets()
    # in train mode, we need to create the backward path, so forward_only is False
    model = Seq2SeqModel(False, config.BATCH_SIZE)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        print('Running session')
        sess.run(tf.global_variables_initializer())
        _check_restore_parameters(sess, saver)

        iteration = model.global_step.eval()
        total_loss = 0
        # Infinite loop
        print('Start training ...')
        train_record_file = open(
            os.path.join(config.PROCESSED_PATH, config.TRAINING_RECORD_FILE),
            'a+')
        test_record_file = open(
            os.path.join(config.PROCESSED_PATH, config.TESTING_RECORD_FILE),
            'a+')
        while True:
            try:
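                # skip_step controls how often progress is logged and a checkpoint is saved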
                skip_step = _get_skip_step(iteration)
                bucket_id = _get_random_bucket(train_buckets_scale)
                encoder_inputs, decoder_inputs, decoder_masks = data.get_batch(
                    data_buckets[bucket_id],
                    bucket_id,
                    batch_size=config.BATCH_SIZE)
                start = time.time()
                _, step_loss, _ = run_step(sess, model, encoder_inputs,
                                           decoder_inputs, decoder_masks,
                                           bucket_id, False)
                total_loss += step_loss
                iteration += 1

                if iteration % skip_step == 0:
                    _train_info = 'Iter {}: loss {}, time {}'.format(
                        iteration, total_loss / skip_step,
                        time.time() - start)
                    print(_train_info)
                    train_record_file.write(_train_info + '\n')
                    start = time.time()
                    total_loss = 0
                    saver.save(sess,
                               os.path.join(config.CPT_PATH, 'chatbot'),
                               global_step=model.global_step)
                    if iteration % (10 * skip_step) == 0:
                        # Run evals on development set and print their loss
                        _test_info = _eval_test_set(sess, model, test_buckets)
                        for item in _test_info:
                            print(item)
                            test_record_file.write("%s\n" % item)
                        start = time.time()
                    sys.stdout.flush()
            except KeyboardInterrupt:
                print('Interrupted by user at iteration {}'.format(iteration))
                train_record_file.close()
                test_record_file.close()
Example #30
	k = 0
	try:
		for i in range(0,len(dialogues)):

			dialogue = dialogues[i].text
			k = k + 2
			if len(name_date[k].text.split()) <  4:
				name = name_date[k].text
			else:
				k = k - 1
				name = name_date[k].text

			if name == "Karan Singla":
				mine = mine + 1
			else:
				others = others + 1

			if dialogue.strip() != '':
				out_text_file.write(dialogue+"\n")
	except:
		continue
#	total = total + len(soup.find_all('p'))
out_text_file.close()

data.FILENAME = out_text_file+"fb_chat.txt"
data.process_data()

print("#total messages", total)
print("#my_messages", mine)
print("#others messages", others)
Example #31
             (index + 1), total_correct_predictions / total_predictions))
        generator_tqdm.set_description(description, refresh=False)
    average_loss = total_eval_loss / len(eval_batches)
    eval_accuracy = total_correct_predictions / total_predictions
    print('Final evaluation accuracy: %.4f loss: %.4f' %
          (eval_accuracy, average_loss))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Script to evaluate a trained model on data.""")
    parser.add_argument('model', help='Path to trained model directory')
    parser.add_argument('--test',
                        help='Path to evaluation data.',
                        default=r'./data/test.csv')
    parser.add_argument('--labels',
                        help='Path to label dictionary.',
                        default=r'./data/answers.json')

    args = parser.parse_args()
    data, label_to_id = load_eval_data(args.test, args.labels)
    print('\nLoading test data...')
    model, model_config, vocab, reverse_vocab = load_model(args.model)
    test_X, test_Y, vocab, reverse_vocab = process_data(
        data, label_to_id, vocab=vocab, vocab_size=model_config['vocab_size'])
    print('Test data loaded.')
    batch_size = 32
    batches = generate_batches(test_X, test_Y, batch_size)
    print('Batches finished generating.')
    train_result = eval(model, batches)
Example #32
from keras.layers import Convolution1D
from keras.models import Sequential
from keras.layers import Activation, Dense, Flatten, Dropout
from data import process_data
from keras import backend as K
import numpy  as np
import os

if not os.listdir('datasets/processed'):
    process_data()

arrhy_data = np.loadtxt(open('datasets/processed/arrhythmia.csv', 'r'), skiprows=1)
malignant_data = np.loadtxt(open('datasets/processed/malignant-ventricular-ectopy.csv', 'r'), skiprows=1)
arrhy_data = arrhy_data[:len(malignant_data)]
arrhy_len = len(arrhy_data) // 500  # integer division so range() receives an int

i = 0
X_train = []
inter_X_train = []
inter_y_train = []
y_train = []
nb_filters = 32
nb_epoch = 10
batch_size = 8
counter = 0

for _ in range(arrhy_len):
    counter += 1
    if not (counter % batch_size):
        X_train.append(inter_X_train)
        y_train.append(inter_y_train)
Example #33
    '--fill',
    type=float,
    metavar='F',
    help=
    'Transparency of the filled portion of the graph. If 0 (default), only plots the lines',
    default=0)
cmd = parser.parse_args()

import visualization
import data

if cmd.U:
    data.download_data()

if len(cmd.countries) > 0:
    p, c = data.process_data(*data.load_data())

    row_mask = [cmd.no_daily, cmd.no_cumulative]
    col_mask = [cmd.no_cases, cmd.no_deaths, cmd.no_recoveries, cmd.no_active]

    smooth = {
        'days': cmd.smooth_days,
        'smoothness': cmd.smoothness,
        'type': 'window'
    }
    if cmd.exponential:
        smooth['type'] = 'exponential'

    visualization.main_plot_countries(c,
                                      cmd.countries,
                                      cmd.begin,
Example #34
def set_log(args):
    set_seeds(args)
    if True:  # args.do_train:
        # preparing embeddings
        tokens_emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
        # preparing train datasets
        assert args.raw_train_file is not None, "--raw_train_file should be set when training!"
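        # preprocess the raw file once and cache the result at args.train_file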
        if not os.path.exists(args.train_file):
            process_data(args.raw_train_file, args.train_file, tokens_emb)
        with open(args.train_file, mode="r", encoding="utf-8") as rfp:
            train_ex = json.load(rfp)
        train_dataset = DuReaderDataset(train_ex)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify,
            return_list=True)
        # preparing dev datasets
        assert args.raw_dev_file is not None, "--raw_dev_file should be set when training!"
        if not os.path.exists(args.dev_file):
            process_data(args.raw_dev_file, args.dev_file, tokens_emb)
        with open(args.dev_file, mode="r", encoding="utf-8") as rfp:
            dev_ex = json.load(rfp)
        dev_dataset = DuReaderDataset(dev_ex)
        dev_batch_sampler = paddle.io.DistributedBatchSampler(
            dev_dataset, batch_size=args.dev_batch_size, shuffle=True)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_dataset,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify,
            return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        if paddle.distributed.get_rank() == 0:
            dev_count = paddle.fluid.core.get_cuda_device_count()
            logger.info("Device count: %d" % dev_count)
            logger.info("Num train examples: %d" % len(train_dataset))
            logger.info("Num dev examples: %d" % len(dev_dataset))
            logger.info("Max train steps: %d" % num_training_steps)
        model = DocReader(args)
        model.init_lr_scheduler(args, num_training_steps)
        model.init_optimizer(args)
        model.init_loss(args)

        # Training process
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss = model.update(batch)
                if global_step % args.logging_steps == 0:
                    logger.info("global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                                % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_file = os.path.join(output_dir + '.ckpt')
                        model.save(model_file)
        model_file = os.path.join(args.output_dir, args.model_name + "-global.ckpt")
        model.save(model_file)

    if args.do_predict:
        # preparing test datasets
        pass
Example #35
File: run.py Project: yyht/openie6
def splitpredict(hparams, checkpoint_callback, meta_data_vocab,
                 train_dataloader, val_dataloader, test_dataloader,
                 all_sentences):
    mapping, conj_word_mapping = {}, {}
    hparams.write_allennlp = True
    if hparams.split_fp == '':
        hparams.task = 'conj'
        hparams.checkpoint = hparams.conj_model
        hparams.model_str = 'bert-base-cased'
        hparams.mode = 'predict'
        model = predict(hparams, None, meta_data_vocab, None, None,
                        test_dataloader, all_sentences)
        conj_predictions = model.all_predictions_conj
        sentences_indices = model.all_sentence_indices_conj
        # conj_predictions = model.predictions
        # sentences_indices = model.all_sentence_indices
        assert len(conj_predictions) == len(sentences_indices)
        all_conj_words = model.all_conjunct_words_conj

        sentences, orig_sentences = [], []
        for i, sentences_str in enumerate(conj_predictions):
            list_sentences = sentences_str.strip('\n').split('\n')
            conj_words = all_conj_words[i]
            if len(list_sentences) == 1:
                orig_sentences.append(list_sentences[0] +
                                      ' [unused1] [unused2] [unused3]')
                mapping[list_sentences[0]] = list_sentences[0]
                conj_word_mapping[list_sentences[0]] = conj_words
                sentences.append(list_sentences[0] +
                                 ' [unused1] [unused2] [unused3]')
            elif len(list_sentences) > 1:
                orig_sentences.append(list_sentences[0] +
                                      ' [unused1] [unused2] [unused3]')
                conj_word_mapping[list_sentences[0]] = conj_words
                for sent in list_sentences[1:]:
                    mapping[sent] = list_sentences[0]
                    sentences.append(sent + ' [unused1] [unused2] [unused3]')
            else:
                assert False
        sentences.append('\n')

        count = 0
        for sentence_indices in sentences_indices:
            if len(sentence_indices) == 0:
                count += 1
            else:
                count += len(sentence_indices)
        assert count == len(sentences) - 1

    else:
        with open(hparams.predict_fp, 'r') as f:
            lines = f.read()
            lines = lines.replace("\\", "")

        sentences = []
        orig_sentences = []
        extra_str = " [unused1] [unused2] [unused3]"
        for line in lines.split('\n\n'):
            if len(line) > 0:
                list_sentences = line.strip().split('\n')
                if len(list_sentences) == 1:
                    mapping[list_sentences[0]] = list_sentences[0]
                    sentences.append(list_sentences[0] + extra_str)
                    orig_sentences.append(list_sentences[0] + extra_str)
                elif len(list_sentences) > 1:
                    orig_sentences.append(list_sentences[0] + extra_str)
                    for sent in list_sentences[1:]:
                        mapping[sent] = list_sentences[0]
                        sentences.append(sent + extra_str)
                else:
                    assert False

    hparams.task = 'oie'
    hparams.checkpoint = hparams.oie_model
    hparams.model_str = 'bert-base-cased'
    _, _, split_test_dataset, meta_data_vocab, _ = data.process_data(
        hparams, sentences)
    split_test_dataloader = DataLoader(split_test_dataset,
                                       batch_size=hparams.batch_size,
                                       collate_fn=data.pad_data,
                                       num_workers=1)

    model = predict(hparams,
                    None,
                    meta_data_vocab,
                    None,
                    None,
                    split_test_dataloader,
                    mapping=mapping,
                    conj_word_mapping=conj_word_mapping,
                    all_sentences=all_sentences)

    if 'labels' in hparams.type:
        label_lines = get_labels(hparams, model, sentences, orig_sentences,
                                 sentences_indices)
        f = open(hparams.out + '.labels', 'w')
        f.write('\n'.join(label_lines))
        f.close()

    if hparams.rescoring:
        print()
        print("Starting re-scoring ...")
        print()

        sentence_line_nums, prev_line_num, curr_line_num, no_extractions = set(), 0, 0, dict()
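        # record the output line number of each sentence's last extraction; sentences
        # with no extractions are collected separately, keyed by line number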
        for sentence_str in model.all_predictions_oie:
            sentence_str = sentence_str.strip('\n')
            num_extrs = len(sentence_str.split('\n')) - 1
            if num_extrs == 0:
                if curr_line_num not in no_extractions:
                    no_extractions[curr_line_num] = []
                no_extractions[curr_line_num].append(sentence_str)
                continue
            curr_line_num = prev_line_num + num_extrs
            sentence_line_nums.add(
                curr_line_num
            )  # check extra empty lines, example with no extractions
            prev_line_num = curr_line_num

        # testing rescoring
        inp_fp = model.predictions_f_allennlp
        rescored = rescore(inp_fp,
                           model_dir=hparams.rescore_model,
                           batch_size=256)

        all_predictions, sentence_str = [], ''
        for line_i, line in enumerate(rescored):
            fields = line.split('\t')
            sentence = fields[0]
            confidence = float(fields[2])

            if line_i == 0:
                sentence_str = f'{sentence}\n'
                exts = []
            if line_i in sentence_line_nums:
                exts = sorted(exts,
                              reverse=True,
                              key=lambda x: float(x.split()[0][:-1]))
                exts = exts[:hparams.num_extractions]
                all_predictions.append(sentence_str + ''.join(exts))
                sentence_str = f'{sentence}\n'
                exts = []
            if line_i in no_extractions:
                for no_extraction_sentence in no_extractions[line_i]:
                    all_predictions.append(f'{no_extraction_sentence}\n')

            arg1 = re.findall("<arg1>(.*)</arg1>", fields[1])[0].strip()
            rel = re.findall("<rel>(.*)</rel>", fields[1])[0].strip()
            arg2 = re.findall("<arg2>(.*)</arg2>", fields[1])[0].strip()
            extraction = Extraction(pred=rel,
                                    head_pred_index=None,
                                    sent=sentence,
                                    confidence=math.exp(confidence),
                                    index=0)
            extraction.addArg(arg1)
            extraction.addArg(arg2)
            if hparams.type == 'sentences':
                ext_str = data.ext_to_sentence(extraction) + '\n'
            else:
                ext_str = data.ext_to_string(extraction) + '\n'
            exts.append(ext_str)

        exts = sorted(exts,
                      reverse=True,
                      key=lambda x: float(x.split()[0][:-1]))
        exts = exts[:hparams.num_extractions]
        all_predictions.append(sentence_str + ''.join(exts))

        if line_i + 1 in no_extractions:
            for no_extraction_sentence in no_extractions[line_i + 1]:
                all_predictions.append(f'{no_extraction_sentence}\n')

        if hparams.out is not None:
            print('Predictions written to ', hparams.out)
            predictions_f = open(hparams.out, 'w')
            predictions_f.write('\n'.join(all_predictions) + '\n')
            predictions_f.close()
        return