Example #1
def main(_):
    """
    Start either training or evaluation. Note the hardcoded parts of the paths for the training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "train.txt"))
        run_train(dataset,
                  hps,
                  os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval"):
        data_dir = os.path.join(FLAGS.datadir, "eval.txt")
        #predict_model = prediction.Model('/dir/ckpt',os.path.join(FLAGS.datadir, "vocabulary.txt"), hps)

        dataset = Dataset(vocab, data_dir, deterministic=True)
        prefix_words = "<brk>".split()
        predict_model = predict.Model(hps, FLAGS.logdir, FLAGS.datadir)
        print('start input')
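        # predict continuations of the prefix; FLAGS.num_sen presumably controls how many sequences are returned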
        out = predict_model.predictnextkwords(prefix_words, FLAGS.num_sen)
        for row in out:
            print(' '.join(row) + "\n")
        print("len_out: " + str(len(out)))
Example #2
def experiment_harness(args):

    experimentLog = {}
    experimentCount = 0

    # TODO: add ranges to argparser
    paramRanges = { 0: ['max_depth', 3, 20, 'int'],
                    1: ['learning_rate', .001, 1, 'float'],
                    2: ['gamma', 0, 2, 'float'] }
    
    client = cluster = None
    
    # create logfile and write headers
    logFilename = 'results.csv'
    if not os.path.isfile( logFilename ):
        with open( logFilename, mode='w+') as outputCSV:
            outputCSV.write("elapsedTime,nSamples,asyncMode,nGPUs,nParticles,nEpochs,globalBestAccuracy,globalBest_max_depth,globalBest_learning_rate,globalBest_gamma,globalBest_nTrees,datasetName\n")
    
    for iDataSamples in args.num_rows:
        # generate or load data directly to the GPU
        if args.dataset == 'synthetic':
            dataset = Dataset('synthetic', iDataSamples)
        elif args.dataset == 'airline':
            dataset = Dataset('airline', iDataSamples)
        elif args.dataset == 'fashion-mnist':
            dataset = Dataset('fashion-mnist', iDataSamples)

        for iGPUs in args.num_gpus:
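            # each (GPU count, particle count, epoch count) combination gets its own Dask cluster and swarm run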
            for iParticles in args.num_particles:
                for iEpochs in args.num_epochs:
                    client, cluster = launch_dask(iGPUs)
                    if args.async_flag:
                        s = swarm.AsyncSwarm(client, dataset, paramRanges=paramRanges,
                                            nParticles=iParticles, nEpochs=iEpochs)
                    else:
                        s = swarm.SyncSwarm(client, dataset, paramRanges=paramRanges,
                                            nParticles=iParticles, nEpochs=iEpochs)
                    startTime = time.time()
                    s.run_search()
                    elapsedTime = time.time() - startTime

                    # TODO: remove fake nTrees
                    s.globalBest['nTrees'] = 9999

                    stringToOutput = f"{elapsedTime},{iDataSamples},{args.async_flag},{iGPUs},{iParticles},{iEpochs},"
                    stringToOutput += f"{s.globalBest['accuracy']},{s.globalBest['params'][0]},{s.globalBest['params'][1]},{s.globalBest['params'][2]},"
                    stringToOutput += f"{s.globalBest['nTrees']},{args.dataset}\n"
                    print( stringToOutput )
                    
                    with open(logFilename, mode='a') as outputCSV:                        
                        outputCSV.write(stringToOutput)
                    
                    print( 'closing dask cluster in between experiment runs [ sleeping for 5 seconds ]')
                    
                    client.close()
                    cluster.close()                    
                    time.sleep(5)                    
Example #3
def main():
    # configuration
    config = Config()
    config.parse_arg(FLAGS)
    config.setup_path()
    config.print_arg()

    # dataset
    if (config.dataset == 'wikibio'):
        dset = DatasetTable2text(config)
        dset.load()
        config.key_size = len(dset.key2id)
    else:
        dset = Dataset(config)
        dset.build()
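    # propagate vocabulary size and special token ids from the dataset into the config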
    config.vocab_size = len(dset.word2id)
    config.dec_start_id = dset.word2id["_GOO"]
    config.dec_end_id = dset.word2id["_EOS"]
    config.pad_id = dset.pad_id
    config.stop_words = dset.stop_words
    config.id2wordemb = dset.id2wordemb

    # model
    if (config.model_name == "transformer_bow"):
        Model = TransformerBow
    elif (config.model_name == "seq2seq"):
        if (config.dataset == 'wikibio'): Model = Seq2seqData2text
        else: Model = Seq2seq
    elif (config.model_name == "bow_seq2seq"): Model = BowSeq2seq
    elif (config.model_name == "vae"): Model = Vae
    elif (config.model_name == "hierarchical_vae"): Model = Hierarchical_Vae
    elif (config.model_name == "latent_bow"):
        if (config.dataset == 'wikibio'): Model = LatentBowData2text
        else: Model = LatentBow
    elif (config.model_name == "lm"): Model = LM
    else:
        msg = "the model name should be in ['transformer_bow', 'seq2seq', 'bow_seq2seq', 'vae', 'hierarchical_vae', 'latent_bow', 'lm'], "
        msg += "current name: %s" % config.model_name
        raise Exception(msg)

    model = Model(config)
    with tf.variable_scope(config.model_name):
        model.build()

    # controller
    controller = Controller(config)
    if (config.model_name != "lm"):
        if ("lm" in controller.eval_metrics_list):
            controller.build_lm(LM, config)
    controller.train(model, dset)
    return
Example #4
def main(_):
    """
    Start either training or evaluation. Note the hardcoded parts of the paths for the training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    print_debug('our training DataSetDir=%s  , LogDir=%s' %
                (FLAGS.datadir, FLAGS.logdir))

    #vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))
    FLAGS.mode = "train"
    for i in range(10):
        print("Iteration ", i, " phase: ", FLAGS.mode)
        if FLAGS.mode == "train":
            #hps.batch_size = 256
            # dataset = Dataset(vocab, os.path.join(FLAGS.datadir,
            #                                       "training-monolingual.tokenized.shuffled/*"))
            dataset = Dataset(vocab,
                              os.path.join(FLAGS.datadir, "ptb.train.txt"))

            trainlogdir = os.path.join(FLAGS.logdir, "train")
            print_debug('train log dir=%s' % (trainlogdir))

            run_train(dataset, hps, trainlogdir, ps_device="/gpu:0")
            print_debug('Finished run_train !!!!!!!!!!!')
        elif FLAGS.mode.startswith("eval"):
            print_debug('eval mode')

            # if FLAGS.mode.startswith("eval_train"):
            #     data_dir = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
            # elif FLAGS.mode.startswith("eval_full"):
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/*")
            # else:
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
            dataset = Dataset(vocab,
                              os.path.join(FLAGS.datadir, "ptb.test.txt"),
                              deterministic=True)
            run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
            print_debug('Finished run_eval !!!!!!!!!!!')

        if FLAGS.mode == "train":
            FLAGS.mode = "eval_full"
        else:
            FLAGS.mode = "train"
Example #5
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
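    # the dev split comes from either the train set or the test set, depending on the config flags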
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #6
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file("1b_word_vocab.txt")

    if FLAGS.mode == "train":
        hps.batch_size = 256
        dataset = Dataset(
            vocab,
            FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        data_dir = FLAGS.datadir
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps,
                 FLAGS.ckptpath)
Example #7
def main(_):
    """
    Start either training or evaluation. Note the hardcoded parts of the paths for the training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset,
                  hps,
                  os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
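        # pick the shard(s) to evaluate: eval_train uses the training files, other eval modes use a heldout shard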
        if FLAGS.mode.startswith("eval_train"):
            data_dir = os.path.join(
                FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
        elif FLAGS.mode.startswith("eval_full"):
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
            )
        else:
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
            )
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("infer"):
        data_dir = os.path.join(
            FLAGS.datadir,
            "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
        )
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_infer(dataset, hps, FLAGS.logdir, FLAGS.mode, vocab)
Example #8
    def test_dataset(self):
        vocab = Vocabulary.from_file("testdata/test_vocab.txt")
        dataset = Dataset(vocab, "testdata/*")

        def generator():
            for i in range(1, 10):
                yield [0] + list(range(1, i + 1)) + [0]
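        # reference counts: token frequencies tallied directly from the generator, then compared against what Dataset._iterate yields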
        counts = [0] * 10
        for seq in generator():
            for v in seq:
                counts[v] += 1

        counts2 = [0] * 10
        for x, y in dataset._iterate(generator(), 2, 4):
            for v in x.ravel():
                counts2[v] += 1
        for i in range(1, 10):
            self.assertEqual(counts[i], counts2[i], "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s" % (i,counts[i], counts2[i]))
Example #9
def main():
  config = Config()
  args = add_arguments(config)
  config.parse_arg(args)
  dset = Dataset(config)
  dset.build()
  # print('debug:')
  # print(dset.id2word[1])
  config.vocab_size = len(dset.word2id)

  # read the transfered sentences
  transfer_analysis = PivotTransferAnalysis(config)

  if(config.model == 'cmu'):
    transfer_analysis.pipeline_w_cmu(dset)
  else:
    transfer_analysis.pipeline(dset)
  return 
Example #10
def main(config, local):
    n_gpu = int(GPU_NUM)
    n_gpu = 1 if n_gpu == 0 else n_gpu
    np.random.seed(config.random_seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Create data instances
    vocab = Vocabulary(config.vocab_path)

    if config.mode == 'train':
        # Prepare train data loader
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)
        train_path = os.path.join(config.data_dir, 'train_data/train_data')
        val_path = os.path.join(config.data_dir, 'train_data/val_data')

        train_dataset.create_instances(train_path,
                                       config.max_seq_length,
                                       type='train')
        val_dataset.create_instances(val_path,
                                     config.max_seq_length,
                                     type='val')

        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=config.batch_size * n_gpu)
    else:
        train_loader, val_loader = None, None

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
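        # NSML platform hooks: bind the model for checkpointing and pause the session if requested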
        bind_model(trainer.model, vocab, config)

        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
Example #11
File: train.py Project: JZCS2018/SMAT
    def __init__(self, opt):
        self.opt = opt
        if opt.dataset_file['val'] is None:
            fnames = [opt.dataset_file['train'], opt.dataset_file['test']]
        else:
            fnames = [
                opt.dataset_file['train'], opt.dataset_file['val'],
                opt.dataset_file['test']
            ]
        tokenizer = build_tokenizer(fnames,
                                    max_seq_len=opt.max_seq_len,
                                    dat_fname='{0}_tokenizer.dat'.format(
                                        opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = Dataset(opt.dataset_file['train'],
                                tokenizer,
                                dat_fname='{0}_train.dat'.format(opt.dataset))
        # self.weight_classes =torch.tensor( compute_class_weight('balanced', np.unique([i['polarity'] for i in self.trainset.data]), self.trainset[4]), dtype = torch.float).to(self.opt.device)
        # self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)self.trainset[4]
        self.testset = Dataset(opt.dataset_file['test'],
                               tokenizer,
                               dat_fname='{0}_test.dat'.format(opt.dataset))
        assert 0 <= opt.valset_ratio < 1
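        # either carve the validation set out of the training data or load it from its own file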
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = Dataset(opt.dataset_file['val'],
                                  tokenizer,
                                  dat_fname='{0}_val.dat'.format(opt.dataset))

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #12
def inference(path, model, vocab, config, **kwargs):
    model.eval()
    test_dataset = Dataset(vocab)
    test_path = os.path.join(path, 'test_data')
    test_dataset.create_instances(test_path,
                                  config.max_seq_length,
                                  type='test')
    test_loader = DataLoader(test_dataset, batch_size=1)

    pred_results = []
    for step, batch in enumerate(test_loader):
        batch = tuple(t.to(device) for t in batch)
        batch = sort_batch(batch)
        input_ids, input_lengths, labels = batch

        outputs = model(input_ids)
        top_1_result = outputs['predicted_intents'][0].item()
        pred_results.append([step, top_1_result])

    return pred_results
Example #13
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    
    vocab = Vocabulary.from_file(FLAGS.datadir + "/lm_vocab.txt", hps.vocab_size)

    if FLAGS.mode == "train":
        hps.batch_size = 256  # reset batchsize
        dataset = Dataset(vocab, FLAGS.datadir + "/train/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/train/*"
        elif FLAGS.mode.startswith("eval_test"):
            data_dir = FLAGS.datadir + "/heldout/*"
        print("data_dir:",data_dir)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("predict_next"):
        data_dir = "data/news.en.heldout-00001-of-00050"
        dataset = Dataset(vocab, data_dir)
        predict_next(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps, vocab)
Example #14
def run_hpo(args):

    client, cluster = launch_dask(args.num_gpus, args.min_gpus, args.k8s,
                                  args.adapt, args.spec)

    # generate or load data directly to the GPU
    if args.dataset == 'synthetic':
        dataset = Dataset('synthetic', args.num_rows)
    elif args.dataset == 'airline':
        dataset = Dataset('airline', args.num_rows)
    elif args.dataset == 'fashion-mnist':
        dataset = Dataset('fashion-mnist', args.num_rows)

    # TODO: add ranges to argparser
    paramRanges = {
        0: ['max_depth', 3, 20, 'int'],
        1: ['learning_rate', .001, 1, 'float'],
        2: ['gamma', 0, 2, 'float']
    }

    if args.async_flag:
        s = swarm.AsyncSwarm(client,
                             dataset,
                             paramRanges=paramRanges,
                             nParticles=args.num_particles,
                             nEpochs=args.num_epochs)
    else:
        s = swarm.SyncSwarm(client,
                            dataset,
                            paramRanges=paramRanges,
                            nParticles=args.num_particles,
                            nEpochs=args.num_epochs)

    s.run_search()

    # Shut down K8S workers
    close_dask(cluster, args.k8s)
Example #15
def main():
    assert os.path.exists(model_dir)
    assert os.path.exists(conf_path)
    assert os.path.exists(summary_dir)
    assert os.path.exists(FLAGS.data_prefix + '.train.txt') and \
            os.path.exists(FLAGS.data_prefix + '.valid.txt') and \
            os.path.exists(FLAGS.data_prefix + '.test.txt')
    assert FLAGS.mode in ['train', 'test']

    logger = logging.getLogger("lm_zh")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if log_path:
        file_handler = logging.FileHandler(log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    logger.info('Parse config file ...')
    config = default_config.parse(conf_path)
    logger.info('Running with config: {}'.format(config.items))

    if FLAGS.mode == 'test':
        config.batch_size *= 2

    logger.info('Build vocab and dataset ...')
    dataset = Dataset(FLAGS.data_prefix,
                      config.num_steps,
                      config.batch_size,
                      train=(FLAGS.mode == 'train'))
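    # train=True presumably switches the Dataset into training-mode iteration (e.g. shuffling)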

    print('Use algo:', config.algo)

    if FLAGS.mode == 'train':
        train(config, dataset, model_dir, summary_dir)
    elif FLAGS.mode == 'test':
        test(config, dataset, model_dir, summary_dir)
Example #16
def main(_):
    hvd.init()
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens

    config = tf.ConfigProto()
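    # pin this process to the GPU matching its Horovod local rank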
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp',
                                    'lm-run-{}'.format(int(time.time())))
        print('logdir: {}'.format(FLAGS.logdir))
    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)
    run_train(dataset,
              hps,
              FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
Example #17
def build_data(Config):
    """
    Procedure to build data
    Args:
        Config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
    """
    # Generators
    train = Dataset(words_filename=Config.source_path,
                    tags_filename=Config.source_tgt_path)
    # test = Dataset(words_filename=Config.test_path,
    # tags_filename=Config.test_tgt_path)

    # Build Word and Tag vocab
    # vocab_words, vocab_tags = get_vocabs([train, test])

    # vocab_words.add(UNK)

    # Save vocab
    # write_vocab(vocab_words, Config.words_vocab)
    # write_vocab(vocab_tags, Config.tags_vocab)
    vocab_build(train, Config.min_count, Config.words_vocab)
Example #18
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    ## General experimental parameters
    parser.add_argument('-exp', type=str,
                        default='')  # which experiment to run
    parser.add_argument('-reddit_path', type=str,
                        default='data/posts.npy')  # path to reddit post data

    parser.add_argument(
        '-val_interval', type=int,
        default=1000)  # how often to evaluate models during training
    parser.add_argument('-size', type=str,
                        default='med')  # maximum post history length
    parser.add_argument('-n_user', type=int,
                        default=8000)  # number of users for experiment
    parser.add_argument(
        '-no_try',
        action='store_true')  # whether or not to run code in a try-except

    ## arguments for training HBERT models
    parser.add_argument(
        '-max_tokens_batch', type=int,
        default=10000)  # how big batches we allow BERT to run (depends on GPU)
    parser.add_argument(
        '-lr', type=float,
        default=0.00001)  # learning rate for HBERT classification layers
    parser.add_argument('-bs', type=int, default=10)  # batch size for training
    parser.add_argument('-n_it', type=int,
                        default=8000)  # number of iterations to train for
    parser.add_argument('-seq', action='store_true')
    parser.add_argument(
        '-temp_file_path', type=str,
        default='')  # path to directory for temp files. if '' this is not used
    parser.add_argument('-preembed_size', type=int,
                        default=10)  # internal hidden size

    opt = parser.parse_args()

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    The first section loads the reddit user data, does some preprocessing,
    and carries out the train/val/test split.
    """
    exp_name = 'experiment_' + str(opt.exp) + '_' + opt.size
    if opt.n_user != 8000:
        exp_name += '_nuser{}'.format(opt.n_user)

    exp_classes = experiment_dict[opt.exp]

    print(exp_classes)

    # create data if not done already
    if (not os.path.isdir(exp_name)):

        # '/projects/bdata/datasets_peter/dataset_3/posts.npy'
        Reddit_posts = np.load(opt.reddit_path, allow_pickle=True)[0]

        Reddit_posts = order_users(Reddit_posts)[:opt.n_user]

        try:
            opt.size = int(opt.size)
            Reddit_posts = [user[:opt.size] for user in Reddit_posts]
            opt.size = 'size' + str(opt.size)
        except:
            if opt.size == 'xsmall':
                Reddit_posts = [user[:50] for user in Reddit_posts]
            elif opt.size == 'test':
                Reddit_posts = [user[:2] for user in Reddit_posts]
            elif opt.size == 'min':
                Reddit_posts = [user[:10] for user in Reddit_posts]
            elif opt.size == 'small':
                Reddit_posts = [user[:100] for user in Reddit_posts]
            elif opt.size == 'med':
                Reddit_posts = [user[:200] for user in Reddit_posts]
            elif opt.size == 'big':
                pass
            else:
                assert (False)

        print(exp_classes)
        Users, Users_full_posts, T, Y, classes = process_users_synth(
            Reddit_posts,  #user_list, #order_users(MH2SW_posts)+ order_users(MH_posts), 
            exp_classes,
            keep_class=True)

        os.mkdir(exp_name)

        np.save('{}/data.npy'.format(exp_name),
                [Users, Users_full_posts, T, Y, classes])

    Users, Users_full_posts, T, Y, classes = tuple(
        np.load('{}/data.npy'.format(exp_name), allow_pickle=True))

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    This section produces feature sets for the different models.

    Feature sets represent some featurization of user histories, e.g.:
    X_chi and X_chi_counts use bag of words, with and without counts;
    X_HBERT largely leaves user histories as text;
    X_LDA processes X_chi_counts using LDA.

    Refer to the paper for further details.
    """
    print('starting data loading...')

    s_time = time.time()
    X_chi = get_features_chi(Users_full_posts)
    X_chi_counts = get_features_chi(Users_full_posts, counts=True)
    print('time = {}'.format(time.time() - s_time))
    X_chi_uni = get_features_chi(Users_full_posts, include_bigrams=False)
    X_HBERT = get_features_HBERT(Users, tokenizer, pretokenize=True)

    print('fitting LDA...')
    n_topics = 20
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)

    X_LDA = lda.fit_transform(X_chi_counts)
    print('fit LDA')

    X_inds = list(range(len(Users)))

    dataset = Dataset(X_inds, T, Y, train_frac=0.4, val_frac=0.1)
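    # with train_frac=0.4 and val_frac=0.1, the remaining 50% of users presumably form the test split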

    inds_train = [data[0] for data in dataset.train_epoch(true_set=True)]
    inds_val = [data[0] for data in dataset.valid_epoch()]
    inds_test = [data[0] for data in dataset.test_epoch()]

    np.save('{}/{}.npy'.format(exp_name, 'inds_dict'), {
        'inds_train': inds_train,
        'inds_val': inds_val,
        'inds_test': inds_test
    })

    print('{} train examples, {} val examples, {} test examples'.format(
        len(inds_train), len(inds_val), len(inds_test)))
    time.sleep(3)

    print('done data loading')

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    The next section defines model_dict data structures, which are used to
    organize training and evaluation of the models.

    First, model dicts are defined, then added to a list of models to run, model_dicts.
    """

    ## instantiate model dicts

    model_dict_0 = {
        'X':
        X_chi,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi[0]),
                                   lr=0.0001,
                                   experiment_name=exp_name + '/LR_12_' +
                                   exp_name),
        'model_name':
        'Logistic_Regression'
    }

    model_dict_1 = {
        'X':
        X_chi,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi[0]),
                               lr=0.0001,
                               experiment_name=exp_name + '/NN_12_' +
                               exp_name),
        'model_name':
        'Simple_NN'
    }

    model_dict_2 = {
        'X':
        X_chi_uni,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi_uni[0]),
                                   lr=0.0001,
                                   experiment_name=exp_name + '/LR_1_' +
                                   exp_name),
        'model_name':
        'Logistic_Regression_(1gram)'
    }

    model_dict_3 = {
        'X':
        X_chi_uni,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi_uni[0]),
                               lr=0.0001,
                               experiment_name=exp_name + '/NN_1_' + exp_name),
        'model_name':
        'Simple_NN_(1gram)'
    }

    # A temporary file can be added to do some precalculation, making HBERT more efficient
    # '/projects/bdata/datasets_peter/precalc/'
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_4 = {
        'X':
        X_HBERT,
        'model':
        Hierarchical_BERT_propensity_model(
            n_it=opt.n_it,
            val_interval=opt.val_interval,
            lr=opt.lr,
            batch_size=opt.bs,
            h_size_sent=1000,
            h_size_user=1000,
            tokenize=False,
            precalc_path=d_input,
            experiment_name=exp_name + '/hbert' + exp_name,
            seq=opt.seq,
            max_tokens_batch=opt.max_tokens_batch,
            preembed_size=opt.preembed_size),
        'model_name':
        'HBERT'
    }

    model_dict_5 = {
        'X':
        X_chi_counts,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi[0]),
                                   lr=0.0001,
                                   experiment_name='LR_12_' + exp_name),
        'model_name':
        'Logistic_Regression_counts'
    }

    model_dict_6 = {
        'X':
        X_chi_counts,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi[0]),
                               lr=0.0001,
                               experiment_name='NN_12_' + exp_name),
        'model_name':
        'Simple_NN_counts'
    }

    model_dict_8 = {
        'X':
        X_LDA,
        'model':
        LogReg_PT_propensity_model(input_dim=n_topics,
                                   lr=0.0001,
                                   experiment_name='LR_12_' + exp_name),
        'model_name':
        'Logistic_Regression_LDA'
    }

    model_dict_9 = {
        'X':
        X_LDA,
        'model':
        NN_PT_propensity_model(input_dim=n_topics,
                               lr=0.0001,
                               experiment_name='NN_12_' + exp_name),
        'model_name':
        'Simple_NN_LDA'
    }

    # A temporary file can be added to do some precalculation, making HBERT more efficient
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_7 = {
        'X':
        X_HBERT,
        'model':
        Average_BERT_propensity_model(n_it=opt.n_it,
                                      val_interval=opt.val_interval,
                                      lr=opt.lr,
                                      batch_size=opt.bs,
                                      h_size_sent=1000,
                                      h_size_user=768,
                                      tokenize=False,
                                      precalc_path=d_input,
                                      experiment_name='avgbert' + exp_name,
                                      seq=opt.seq,
                                      max_tokens_batch=opt.max_tokens_batch),
        #preembed_size = opt.preembed_size),
        'model_name':
        'avgBERT'
    }

    # a list of dictionaries to keep track of all models to run
    model_dicts = [
        model_dict_8, model_dict_9, model_dict_5, model_dict_6, model_dict_0,
        model_dict_1, model_dict_2, model_dict_3, model_dict_4
    ]

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    This last section runs each model for the given experiment.
    """

    ## loop over the models
    stat_dicts = []

    if opt.no_try:
        for i, model_dict in enumerate(model_dicts):
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(
                    exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model,
                                                      dataset,
                                                      data_test=True)

                stat_dicts += [stat_dict]

                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                        stat_dict)

            stat_dict = np.load('{}/{}.npy'.format(exp_name,
                                                   model_dict['model_name']),
                                allow_pickle=True).item()
            print(stat_dict)
            print(type(stat_dict))

            stat_dict_print = {
                key: stat_dict[key]
                for key in [
                    k for k in stat_dict.keys()
                    if 'P_' not in k and 'Z_' not in k and 'Y_' not in k
                ]
            }

            print('model {}, statdict {}'.format(model_dict['model_name'],
                                                 stat_dict_print))

        return

    for i, model_dict in enumerate(model_dicts):
        try:
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(
                    exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model,
                                                      dataset,
                                                      data_test=True)

                stat_dicts += [stat_dict]

                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                        stat_dict)

            stat_dict = np.load('{}/{}.npy'.format(exp_name,
                                                   model_dict['model_name']),
                                allow_pickle=True).item()

            stat_dict_print = {
                key: stat_dict[key]
                for key in [k for k in stat_dict.keys() if 'P_' not in k]
            }

            print('model {}, statdict {}'.format(model_dict['model_name'],
                                                 stat_dict_print))

        except:
            print('model {} FAILED'.format(model_dict['model_name']))
Example #19
File: train.py Project: simon555/LM_word
os.makedirs(directoryOut)
os.makedirs(directoryData)
os.makedirs(directoryCkpt)
os.makedirs(directoryOutLogs)

num_words = None

seq_len = 25
batch_size = 16
valid_batch_size = 16  ## Needs to be smaller due to memory issues
embed_size = 64
num_epochs = 20
hidden_size = 64
num_layers = 1

dataset = Dataset(data_dir, num_words)
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save(dataset_specific_info)

params = {}

#take account of the 0 token for padding
params['vocab_size'] = dataset.vocab_size + 1
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['valid_batch_size'] = valid_batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size
Example #20
def main():
    path_embedding_glove = './glove.6B.100d.txt'
    path_dataset_train = './datasets/restaurants_train.json'
    #path_dataset_train = './datasets/mini.json'
    path_dataset_trial = './datasets/restaurants_trial.json'
    path_study_cases = './datasets/study_cases.json'
    path_saved = './saved_at/'
    path_log = './log_at.txt'

    embedding = EmbeddingGlove(path_embedding_glove)
    dataset_train = Dataset(path_dataset_train, embedding)
    dataset_trial = Dataset(path_dataset_trial, embedding)
    study_cases = Dataset(path_study_cases, embedding)

    max_sentence_len_train = dataset_train.metadata.max_sentence_len
    max_sentence_len_trial = dataset_trial.metadata.max_sentence_len

    max_aspect_len_train = dataset_train.metadata.max_aspect_len
    max_aspect_len_trial = dataset_trial.metadata.max_aspect_len

    # ======================================================================

    embedding_matrix = torch.tensor(embedding.matrix, dtype=torch.float)
    embedding_dim = embedding.embedding_dim
    hidden_dim = 150
    polarity_dim = 3
    batch_size = 40
    max_sentence_len = max(max_sentence_len_train, max_sentence_len_trial)
    max_aspect_len = max(max_aspect_len_train, max_aspect_len_trial)
    epochs = 40
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print('embedding_dim: ' + str(embedding_dim))
    print('hidden_dim: ' + str(hidden_dim))
    print('polarity_dim: ' + str(polarity_dim))
    print('batch_size: ' + str(batch_size))
    print('max_sentence_len: ' + str(max_sentence_len))
    print('max_aspect_len: ' + str(max_aspect_len))
    print('epochs: ' + str(epochs))
    print('device: ' + str(device))

    # ======================================================================

    batches_train_sentences, batches_train_aspects, batches_train_polarities = dataset_train.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    batches_trial_sentences, batches_trial_aspects, batches_trial_polarities = dataset_trial.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    study_cases_sentences, study_cases_aspects, study_cases_polarities = study_cases.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    num_batches = len(batches_train_sentences)

    # ======================================================================

    model = AT(embedding_matrix, embedding_dim, hidden_dim, polarity_dim,
               max_sentence_len, max_aspect_len)
    model.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # ======================================================================

    train = False
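    # set train = True to retrain from scratch; with False, the saved checkpoint below is loaded instead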
    file_name_saved = 'aoa_epoch38_acuracy0.951239224137931'

    if train:
        file_log = open(path_log, 'w')
        max_acuracy = 0.0

        for epoch in range(epochs):
            print('\n========== Epoch ' + str(epoch) + ' ==========')

            model.train()

            for i in range(num_batches):
                optimizer.zero_grad()

                batch_sentences = batches_train_sentences[i]
                batch_sentences = torch.tensor(batch_sentences,
                                               dtype=torch.long).to(device)

                batch_aspects = batches_train_aspects[i]
                batch_aspects = torch.tensor(batch_aspects,
                                             dtype=torch.long).to(device)

                batch_polarities = batches_train_polarities[i]
                batch_polarities = torch.tensor(batch_polarities,
                                                dtype=torch.long).to(device)

                prediction, _ = model(batch_sentences, batch_aspects)

                loss = loss_function(prediction, batch_polarities)
                loss.backward()
                optimizer.step()

            acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                                batches_train_sentences,
                                                batches_train_aspects,
                                                batches_train_polarities)

            print('acuracy train: ' + str(acuracy))
            print('f1 train: ' + str(f1))

            file_log.write('epoch: ' + str(epoch) + '\n')
            file_log.write('acuracy_train: ' + str(acuracy) + ' f1_train: ' +
                           str(f1) + '\n')

            if acuracy >= max_acuracy:
                max_acuracy = acuracy
                file_name_saved = 'at_epoch' + str(epoch) + '_acuracy' + str(
                    acuracy)
                torch.save(model.state_dict(), path_saved + file_name_saved)
                print('saved: ' + path_saved + file_name_saved)

        file_log.close()

    else:
        print('\n========== Load saved ==========')

        model.load_state_dict(torch.load(path_saved + file_name_saved))
        print('load: ' + path_saved + file_name_saved)

        acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                            batches_train_sentences,
                                            batches_train_aspects,
                                            batches_train_polarities)
        print('acuracy train: ' + str(acuracy))
        print('f1 train: ' + str(f1))

# ======================================================================

    print('\n********** Trial dataset **********')

    acuracy, f1, indices_failures = CalculateAcuracyF1(
        model, device, batches_trial_sentences, batches_trial_aspects,
        batches_trial_polarities)
    print('acuracy trial: ' + str(acuracy))
    print('f1 trial: ' + str(f1))
    print('indices failures:')
    print(indices_failures)

    for index in indices_failures:
        print(dataset_trial.opinions[index])

# ======================================================================

    print('\n********** Study cases **********')

    with torch.no_grad():
        for i in range(len(study_cases_sentences)):
            batch_sentences = study_cases_sentences[i]
            batch_sentences = torch.tensor(batch_sentences,
                                           dtype=torch.long).to(device)

            batch_aspects = study_cases_aspects[i]
            batch_aspects = torch.tensor(batch_aspects,
                                         dtype=torch.long).to(device)

            batch_polarities = study_cases_polarities[i]

            prediction, attention = model(batch_sentences, batch_aspects)

            print('Sentences: ')
            print(batch_sentences)
            print('Aspects: ')
            print(batch_aspects)
            print('Polarities: ')
            print(batch_polarities)
            print('Prediction: ')
            print(prediction)
            print('Attention: ')
            print(attention.squeeze(-1))
Example #21
                yield (s, sl, q, a, al)
                del (batch_s[:], batch_sl[:], batch_q[:], batch_a[:],
                     batch_al[:])

        #batch = [(s,sl,q,a,al) for s,sl,q,a,al in zip(batch_s,batch_sl,batch_q,batch_a,batch_al)]
        #batch = sorted(batch, key=lambda tup:len(tup[0]),reverse=True)
        #s,sl,q,a,al = zip(*batch)
        #if len(batch_s) == batch_size:
        #        yield (s,sl,q,a,al)


if __name__ == '__main__':

    from data_utils import Dataset

    for i in np.arange(14, 20):
        print('start')
        data = Dataset(i + 1)
        data.preprocess('train')
        data.preprocess('valid')
        data.preprocess('test')
        pickle.dump(data, open('data/qa' + str(i + 1) + '.pickle', 'wb'))
        print(i)
#dataset = pickle.load(open('data/qa2.pickle','rb'))

#for idx, (s, sl, q, a, al) in enumerate(dataset.data_loader('train')):
#print(s[0].shape)
#print(sl[0])
#print(q[1])
#print(dataset.idx2word(a))
Example #22
def train_pos(args):
    src_embedding = None
    target_embedding = None
    logger = get_logger(args.log)
    logger.info('Model Type: {}'.format(args.type))
    if os.path.exists(args.config) and (not args.config == 'debug.json'):
        logger.info('Loading config from {}'.format(args.config))
        config = json.load(open(args.config, 'r'))
        try:
            vocab_word = pickle.load(open(config['word'], 'rb'))
            vocab_tag = pickle.load(open(config['tag'], 'rb'))
            target_vocab_word = pickle.load(open(config['target_word'], 'rb'))

            assert len(vocab_word) == config['nword']
            assert len(vocab_tag) == config['ntag']
            assert len(target_vocab_word) == config['ntarword']

            if args.use_pretrain_src:
                _, src_embedding = load_pre_train(args.src_embedding)

            if args.use_pretrain_target:
                _, target_embedding = load_pre_train(args.target_embedding)

        except Exception as e:
            logger.error(e)
            exit(0)
    else:
        if args.use_pretrain_src:
            pre_dictionary, src_embedding = load_pre_train(args.src_embedding)
            vocab_word, vocab_tag = load_vocab(args.train_file, pre_dictionary)
        else:
            vocab_word, vocab_tag = load_vocab(args.train_file)

        if args.use_pretrain_target:
            pre_dictionary, target_embedding = load_pre_train(
                args.target_embedding)
            target_vocab_word, _ = load_vocab(args.train_file, pre_dictionary)
        else:
            target_vocab_word, _ = load_vocab(args.target_train_file)

        i = 0
        while os.path.exists('./.cache/vocab_{}.pickle'.format(
                str(i))) or os.path.exists('./.cache/tag_{}.pickle'.format(
                    str(i))):
            i += 1
        if not os.path.exists('./.cache'):
            os.makedirs('./.cache')
        with open('./.cache/vocab_{}.pickle'.format(str(i)),
                  'wb') as vocab, open('./.cache/tag_{}.pickle'.format(
                      str(i)), 'wb') as tag, open(
                          './.cache/target_vocab_{}.pickle'.format(str(i)),
                          'wb') as tar_vocab:
            pickle.dump(vocab_word, vocab)
            pickle.dump(vocab_tag, tag)
            pickle.dump(target_vocab_word, tar_vocab)

        with open(args.config, 'w+') as config:
            json.dump(
                {
                    'word': './.cache/vocab_{}.pickle'.format(str(i)),
                    'tag': './.cache/tag_{}.pickle'.format(str(i)),
                    'target_word': './.cache/target_vocab_{}.pickle'.format(
                        str(i)),
                    'nword': len(vocab_word),
                    'ntag': len(vocab_tag),
                    'ntarword': len(target_vocab_word)
                },
                config,
                indent='\t')

    nword = len(vocab_word)
    ntag = len(vocab_tag)
    ntarword = len(target_vocab_word)

    logger.info("Src:    {}  {}".format(nword, ntag))
    logger.info("Target: {}".format(ntarword))
    logger.info("Flag:   {}".format(args.flag))
    logger.info(
        "Src embed trainable: {}".format(not args.disable_src_embed_training))
    logger.info("\ntrain:{}\ndev  :{}\ntest :{}\n\n".format(
        args.train_file, args.dev_file, args.test_file))
    logger.info("\nTarget: \ntrain:{}\ndev  :{}\ntest :{}\n".format(
        args.target_train_file, args.target_dev_file, args.target_test_file))
    logger.info("MSG:   {}\n".format(args.msg))
    logger.info("lr_ratio: {}\n".format(str(args.lr_ratio)))
    logger.info("penalty_ratio: {}\n".format(str(args.penalty_ratio)))
    logger.info("penalty: {}\n".format(str(args.penalty)))

    processing_word = get_processing(vocab_word)
    processing_tag = get_processing(vocab_tag)
    processing_target_word = get_processing(target_vocab_word)

    src_train = Dataset(args.train_file, processing_word, processing_tag, None)
    src_dev = Dataset(args.dev_file, processing_word, processing_tag, None)
    src_test = Dataset(args.test_file, processing_word, processing_tag, None)

    target_train = Dataset(args.target_train_file, processing_target_word,
                           processing_tag)
    target_dev = Dataset(args.target_dev_file, processing_target_word,
                         processing_tag)
    target_test = Dataset(args.target_test_file, processing_target_word,
                          processing_tag)

    src_len = len(src_train)
    target_len = len(target_train)
    ratio = target_len / (src_len + target_len)
    logger.info("\nsrc:    {}\ntarget: {}\n".format(src_len, target_len))

    # ratio = 0.1 if ratio < 0.1 else ratio
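    # split each mini-batch between source and target data in proportion to dataset sizes, keeping at least one target example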
    target_batch_size = int(ratio * args.batch_size)
    target_batch_size = 1 if target_batch_size < 1 else target_batch_size
    src_batch_size = args.batch_size - target_batch_size
    logger.info("\nsrc_batch_size: {}\ntarget_batch_size: {}".format(
        src_batch_size, target_batch_size))
    assert target_batch_size >= 0

    model = Model(args,
                  ntag,
                  nword,
                  ntarwords=ntarword,
                  src_embedding=src_embedding,
                  target_embedding=target_embedding,
                  logger=logger,
                  src_batch_size=src_batch_size)

    model.build()
    try:
        print("========If !!! it's debugging!==========")
        print(args.debug)
        if args.debug:
            print("========it's debugging!==========")
            model.train(src_dev, src_dev, vocab_tag, target_dev, target_dev,
                        target_test, src_batch_size, target_batch_size)
        else:
            # model.train(src_train, src_dev, vocab_tag, target_train, target_dev, src_batch_size, target_batch_size)
            model.train(src_train, src_dev, vocab_tag, target_train,
                        target_dev, target_test, src_batch_size,
                        target_batch_size)
    except KeyboardInterrupt:
        model.evaluate(target_dev, vocab_tag, target='target')
Example #23
def main():

    # Read datasets
    data = Dataset(args.DATA_DIR)
    sents, tags = data.get_all_data()

    # Construct the model
    MyModel = BiLSTMModel(args.MAX_SEQ_LEN, args.EMBEDDING,
                          args.LSTM_HIDDEN_UNITS, args.LSTM_DENSE_DIM,
                          data.get_nwords(), data.get_ntags())
    model = MyModel.define_model()

    num_train_sents = len(data.train_sents)
    num_val_sents = len(data.val_sents)
    num_test_sents = len(data.test_sents)

    print(
        "# train sents = {0} \n # of val sents = {1} \n # of test sents = {2}".
        format(num_train_sents, num_val_sents, num_test_sents),
        flush=True)

    # indexes to train, val and test data
    partition = {
        "train": list(range(num_train_sents)),
        "val": list(range(num_val_sents)),
        "test": list(range(num_test_sents))
    }

    # Parameters
    params = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': args.BATCH_SIZE,
        'n_classes': data.get_ntags(),
        'shuffle': True,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Generators
    training_generator = DG.DataGenerator(partition['train'], data.train_sents,
                                          data.train_tags, **params)
    validation_generator = DG.DataGenerator(partition['val'], data.val_sents,
                                            data.val_tags, **params)

    # Train model on dataset
    history = model.fit_generator(generator=training_generator,
                                  validation_data=validation_generator,
                                  use_multiprocessing=True,
                                  epochs=args.NUM_EPOCHS,
                                  verbose=1)

    # Parameters
    params_test = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': 1,
        'n_classes': data.get_ntags(),
        'shuffle': False,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Make predictions
    testing_generator = DG.DataGenerator(partition['test'], data.test_sents,
                                         data.test_tags, **params_test)

    pred_test = model.predict_generator(generator=testing_generator,
                                        steps=num_test_sents)
    pred_test = np.argmax(pred_test, axis=-1)

    # print(pred_test.shape)

    def pad(x):
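        # pad every tag sequence with the PAD index, then truncate to MAX_SEQ_LEN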
        x1 = [
            tgs + ([data.get_tag2idx()["PAD"]] * (args.MAX_SEQ_LEN - len(tgs)))
            for tgs in x
        ]
        x2 = [tgs[:args.MAX_SEQ_LEN] for tgs in x1]
        return np.array(x2)

    test_tags_padded = pad(data.test_tags)

    # print(test_tags_padded.shape)

    def get_measures(yTrue, yPred):
        y1 = yTrue.reshape(1, -1).squeeze()
        y2 = yPred.reshape(1, -1).squeeze()

        P = precision_score(y1, y2, average=None)
        R = recall_score(y1, y2, average=None)
        F1 = f1_score(y1, y2, average=None)

        print("Precision=", flush=True)
        print(P, flush=True)
        print("Recall=", flush=True)
        print(R, flush=True)
        print("F1 score=", flush=True)
        print(F1, flush=True)

    print("Test...", flush=True)
    get_measures(test_tags_padded, pred_test)
Example #24
				f.write('\n')
		
		avg_loss = dev_loss/len(data)
		print(avg_loss)
		f.close()
		return invalid



data = load_data(data_dir + '/msmarco/' + args.data +'_v2.1.json')

params = {'batch_size': 256,
          'shuffle': False,
          'num_workers': 16}

dev_set = Dataset(data, max_plen, max_qlen, glove_vec_size, data_dir)
dev_generator = utils.data.DataLoader(dev_set, **params)

device = torch.device('cpu')

cuda = torch.cuda.is_available()
if(cuda):
	device = torch.device('cuda')

config = Config(glove_vec_size, hidden_size, max_plen, max_qlen, num_para, device)


model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if(cuda):
Example #25
File: train.py Project: omareqbal/BTP
		}, 'checkpoints/saved_model.pth')



train_data = load_data('preprocessed_data/train_data.json', thres, max_plen)[0:100000]

print('Done loading Training data.')


train_params = {'batch_size': 32,
          'shuffle': True,
          'num_workers': 32,
	  	  'pin_memory': True}


training_set = Dataset(train_data, max_plen, max_qlen, data_dir, glove_vec_size)
training_generator = DataLoader(training_set, **train_params)

cuda = torch.cuda.is_available()

device = torch.device('cpu')
if(cuda):
	device = torch.device('cuda')

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size, hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)

if(cuda):
	model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
Example #26
    plt.axis('off')


netG = Generator(name="dcgan_g_html")
netD = Discriminator(name="dcgan_d_html")

loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()

real_label = nd.ones((batch_size, ), ctx=ctx)
fake_label = nd.zeros((batch_size, ), ctx=ctx)

img_list = [
    os.path.join(data_path, x) for x in os.listdir(data_path)
    if x.endswith('png')
]
train_data = Dataset(img_list, img_dims, batch_size=batch_size)


def init_params():
    netG.initialize(mx.init.Normal(0.02), ctx=ctx)
    netD.initialize(mx.init.Normal(0.02), ctx=ctx)


def load_weights():
    netG.load_params(ctx=ctx)
    netD.load_params(ctx=ctx)


def init_optimizers():
    trainerG = mx.gluon.Trainer(netG.collect_params(), 'adam', {
        'learning_rate': lr,
Example #27
from data_utils import read_dictionary, Dataset, vocab_tags
from general_utils import get_logger
from model import Model
from config import Config
import os
import sys

if not os.path.exists(Config.output_path):
    os.makedirs(Config.output_path)

# vocab_words = load_vocab(Config.words_vocab)
# vocab_tags = load_vocab(Config.tags_vocab)
vocab_words = read_dictionary(Config.words_vocab)

# print(vocab_words)
# print(vocab_tags)
# sys.exit(0)

test = Dataset(Config.test_path, Config.test_tgt_path, Config.max_iter)
train = Dataset(Config.source_path, Config.source_tgt_path, Config.max_iter)

logger = get_logger(Config.log_path)

model = Model(Config,
              ntags=len(vocab_tags),
              n_words=len(vocab_words),
              logger=logger)

model.build()

model.train(train, test, vocab_tags, vocab_words)
Example #28
def main(_):

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    dataset = Dataset(
        vocab,
        os.path.join(FLAGS.datadir,
                     "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):

        state_c = []
        state_h = []

        if len(state_c) == 0:
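            # initialize per-replica LSTM cell and hidden states to zeros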
            state_c.extend([
                np.zeros([FLAGS.batch_size, model.state_size],
                         dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])
            state_h.extend([
                np.zeros([FLAGS.batch_size, model.projected_size],
                         dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])

        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps,
            num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }

        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
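                # synthetic mode: random token ids stand in for real batches (handy for throughput testing)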
                x = np.random.randint(
                    low=0,
                    high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                y = np.random.randint(
                    low=0,
                    high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                w = np.ones((FLAGS.batch_size * num_replicas_per_worker,
                             FLAGS.num_steps))
            else:
                x, y, w = next(data_iterator)
            feeds = {}
            feeds[model.x] = np.split(x, num_replicas_per_worker)
            feeds[model.y] = np.split(y, num_replicas_per_worker)
            feeds[model.w] = np.split(w, num_replicas_per_worker)
            feeds[model.initial_state_c] = state_c
            feeds[model.initial_state_h] = state_h
            fetched = sess.run(fetches, feeds)

            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']

            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                wps = (fetched['global_step'][0] -
                       prev_global_step) * num_words / elapsed_time
                prev_global_step = fetched['global_step'][0]
                parallax.log.info(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                    % (fetched['global_step'][0], cur_time - prev_time, wps,
                       fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
Example #29
        return results_sel_para, results_pred_start, results_pred_end


dev_data = load_data('preprocessed_data/dev_data.json', thres,
                     max_plen)[0:25000]

print('Done loading dev data.')

params = {
    'batch_size': 32,
    'shuffle': False,
    'num_workers': 32,
    'pin_memory': True
}

dev_set = Dataset(dev_data, max_plen, max_qlen, data_dir, glove_vec_size)
dev_generator = DataLoader(dev_set, **params)

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size,
                hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if (cuda):
    model = model.to(device)

checkpoint = torch.load('checkpoints/saved_model.pth')
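# restore the trained weights and optimizer state from the checkpoint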

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
Example #30
num_cates = max(ent2idx.values()) + 1
sent_len = 64
vocab_size = 2320
emb_size = 256
sent_pad = 10
seq_len = sent_len + 2 * sent_pad

test_data_dir = '../data/chusai_xuanshou'
test_docs = Documents(data_dir=test_data_dir)
sent_extrator = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)
test_sents = sent_extrator(test_docs)

with open('word2idx.json', 'r') as f:
    word2idx = eval(f.read())

test_data = Dataset(test_sents, word2idx=word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]

print(len(test_docs))

w2v_embeddings = np.load('w2v_embeddings.npy')

model = build_lstm_crf_model(num_cates,
                             seq_len=seq_len,
                             vocab_size=vocab_size,
                             model_opts={
                                 'emb_matrix': w2v_embeddings,
                                 'emb_size': emb_size,
                                 'emb_trainable': False
                             })
model.load_weights(