Example #1
def train_loop(net,
               dname,
               data_dir,
               epochs=90,
               workers=4,
               resume='',
               savedir='./',
               save_all_epochs=False,
               q_nograd_its=0,
               batch_size=256):
    mkdir(savedir)
    global best_err1  # assumed to be initialized at module level (e.g. best_err1 = np.inf)

    # Load data here:
    _, train_loader, val_loader, _, _, Ntrain = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers, distributed=False, data_dir=data_dir)

    net.N_train = Ntrain

    start_epoch = 0

    marginal_loglike = np.zeros(epochs)
    train_loss = np.zeros(epochs)
    dev_loss = np.zeros(epochs)

    err_train = np.zeros(epochs)
    err_dev = np.zeros(epochs)

    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            start_epoch, best_err1 = net.load(resume)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                resume, start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(
            candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(
                candidate_progress_file))
            try:
                marginal_loglike, err_train, train_loss, err_dev, dev_loss = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(
                    candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(
                    candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(
                candidate_progress_file))

    if q_nograd_its > 0:
        net.prob_model.q_logits.requires_grad = False

    for epoch in range(start_epoch, epochs):
        if q_nograd_its > 0 and epoch == q_nograd_its:
            net.prob_model.q_logits.requires_grad = True

        tic = time.time()
        nb_samples = 0
        for x, y in train_loader:
            marg_loglike_estimate, minus_loglike, err = net.fit(x, y)

            marginal_loglike[epoch] += marg_loglike_estimate * x.shape[0]
            err_train[epoch] += err * x.shape[0]
            train_loss[epoch] += minus_loglike * x.shape[0]
            nb_samples += len(x)

        marginal_loglike[epoch] /= nb_samples
        train_loss[epoch] /= nb_samples
        err_train[epoch] /= nb_samples

        toc = time.time()

        # ---- print
        print('\n depth approx posterior',
              net.prob_model.current_posterior.data.cpu().numpy())
        print(
            "it %d/%d, ELBO/evidence %.4f, pred minus loglike = %f, err = %f" %
            (epoch, epochs, marginal_loglike[epoch], train_loss[epoch],
             err_train[epoch]),
            end="")
        cprint('r', '   time: %f seconds\n' % (toc - tic))

        net.update_lr()

        # ---- dev
        tic = time.time()
        nb_samples = 0
        for x, y in val_loader:
            minus_loglike, err = net.eval(x, y)

            dev_loss[epoch] += minus_loglike * x.shape[0]
            err_dev[epoch] += err * x.shape[0]
            nb_samples += len(x)

        dev_loss[epoch] /= nb_samples
        err_dev[epoch] /= nb_samples

        toc = time.time()

        cprint('g',
               '     pred minus loglike = %f, err = %f\n' %
               (dev_loss[epoch], err_dev[epoch]),
               end="")
        cprint('g', '    time: %f seconds\n' % (toc - tic))

        filename = 'checkpoint.pth.tar'
        if save_all_epochs:
            filename = str(epoch) + '_' + filename
        net.save(os.path.join(savedir, filename), best_err1)
        if err_dev[epoch] < best_err1:
            best_err1 = err_dev[epoch]
            cprint('b', 'best top1 dev err: %f' % err_dev[epoch])
            shutil.copyfile(os.path.join(savedir, filename),
                            os.path.join(savedir, 'model_best.pth.tar'))

        all_results = [
            marginal_loglike, err_train, train_loss, err_dev, dev_loss
        ]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
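All of the examples on this page call the helpers save_object and load_object, whose definitions are not shown. A minimal pickle-based sketch of what such helpers might look like (an assumption for illustration, not the project's actual implementation):

import pickle

def save_object(obj, filename):
    # Serialize an arbitrary Python object to disk.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    # Load an object previously written by save_object.
    with open(filename, 'rb') as f:
        return pickle.load(f)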
Example #2
    def __init__(self, w2v_path, save_dir='models/recommender', courses_path='data/processed/grouped_courses.json'):
        """Load, preprocess and precompute word and course vectors. Load vectors if precomputed are available.

        Args:
            w2v_path (str): Google News Word2Vec word embeddings path.
            save_dir (str): Directory path to persist precomputed vectors.
            courses_path (str): Courses json path, grouped by category {category: Course}.

        """
        self.logger = logging.getLogger(APP_NAME + ".Recommender")
        self.logger.info("Loading word2vec embeddings")
        self.word2id, self.id2word, self.word_embeddings = load_word2vec(w2v_path, 1000000)
        self.logger.info("Loaded %d word vectors with dim=%d" % self.word_embeddings.shape)
        self.stopwords = stopwords.words("english")

        course2id_path = os.path.join(save_dir, 'course2id.pkl')
        id2course_path = os.path.join(save_dir, 'id2course.pkl')
        category2id_path = os.path.join(save_dir, 'category2id.pkl')
        id2category_path = os.path.join(save_dir, 'id2category.pkl')
        category2courses_path = os.path.join(save_dir, 'category2courses.pkl')
        course_embeddings_path = os.path.join(save_dir, 'course_embeddings.pkl')
        category_embeddings_path = os.path.join(save_dir, 'category_embeddings.pkl')
        if os.path.exists(save_dir):  # assumes the directory exists only if all pickles were previously saved
            self.logger.info("Loading course embeddings")
            self.course2id = load_object(course2id_path)
            self.id2course = load_object(id2course_path)
            self.category2id = load_object(category2id_path)
            self.id2category = load_object(id2category_path)
            self.category2courses = load_object(category2courses_path)
            self.course_embeddings = load_object(course_embeddings_path)
            self.category_embeddings = load_object(category_embeddings_path)
        else:
            os.makedirs(save_dir)
            self.logger.info("Course embeddings not found, building")
            self.course2id, self.id2course, self.category2id, self.id2category, self.category2courses, \
                self.course_embeddings, self.category_embeddings = self._prepare_courses(courses_path)
            save_object(self.course2id, course2id_path)
            save_object(self.id2course, id2course_path)
            save_object(self.category2id, category2id_path)
            save_object(self.id2category, id2category_path)
            save_object(self.category2courses, category2courses_path)
            save_object(self.course_embeddings, course_embeddings_path)
            save_object(self.category_embeddings, category_embeddings_path)
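Example #2 follows a load-or-build caching pattern: if the save directory already exists, the precomputed objects are loaded from disk; otherwise they are built once and persisted with save_object. The same pattern, condensed into a small helper (an illustrative sketch only; build_course2id is a hypothetical builder, not part of the original class):

import os

def load_or_build(path, build_fn):
    # Return the cached object at `path` if present; otherwise build it,
    # persist it for next time, and return the fresh object.
    if os.path.isfile(path):
        return load_object(path)
    obj = build_fn()
    save_object(obj, path)
    return obj

# Usage sketch:
# course2id = load_or_build(course2id_path, lambda: build_course2id(courses_path))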
Example #3
    def train_machines_multiple_dfs_new(self, _labels, _experiment_output_name='demo', _max_iter=20,
                                         _prediction_path=None, _print=False, _test_data=None,
                                         _test_labels=None, _uniformly=False):
        """ Train the PFSMs given a set of dataframes and their labels

        :param _labels: column types labeled by hand, where _labels[i][j] denotes the type of the j^th column in the i^th dataframe.
        :param _experiment_output_name:
        :param _max_iter: the maximum number of iterations the optimization algorithm runs if it has not converged earlier.
        :param _prediction_path:
        :param _print:
        :param _test_data:
        :param _test_labels:
        :param _uniformly: a boolean flag controlling PFSM initialization; True initializes the machines uniformly rather than with hand-crafted values.
        :return:
        """
        self.print = _print
        self.prediction_path = _prediction_path
        self.experiment_output_name = _experiment_output_name

        if _uniformly:
            self.initialize_params_uniformly()

        # Setup folders and probabilities for all columns
        self.normalize_params()

        # Changing column names
        self.data_frames = [data_frame.rename(columns=lambda n: str(n).replace(' ', '')) for data_frame in self.data_frames]
        self.model.data_frames = self.data_frames

        # find the unique values in all of the columns once
        for i, df in enumerate(self.model.data_frames):
            if i == 0:
                unique_vals = np.unique(df.values)
            else:
                unique_vals = np.concatenate((unique_vals, np.unique(df.values)))
        self.model.unique_vals = unique_vals

        self.PFSMRunner.set_unique_values(unique_vals)

        # Finding unique values and their counts
        self.model.dfs_unique_vals_counts = {}
        for i, df in enumerate(self.data_frames):
            df_unique_vals_counts = {}
            for column_name in list(df.columns):
                temp_x, counts = np.unique([str(int_element) for int_element in df[column_name].tolist()], return_counts=True)
                counts = {u_data: c for u_data, c in zip(temp_x, counts)}
                temp_counts = list(counts.values())
                counts_array = np.reshape(temp_counts, newshape=(len(temp_counts),))
                df_unique_vals_counts[column_name] = [temp_x, counts_array]
            self.model.dfs_unique_vals_counts[str(i)] = df_unique_vals_counts

        # Setting
        self.model.labels = _labels
        self.model.types = self.types
        self.model.J = len(self.PFSMRunner.machines)  # J: num of data types including missing and anomaly.
        self.model.K = self.model.J - 2  # K: num of possible column data types (excluding missing and anomaly)
        self.model.pi = [self.model.PI for j in range(self.model.K)]  # mixture weights of row types
        self.model.current_runner = self.PFSMRunner

        training_error = []
        training_error.append(self.calculate_error_df(self.data_frames, _labels))

        save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner_initial.pkl')
        print(training_error)

        # Iterates over whole data points
        for it in range(_max_iter):
            print_to_file('iteration = ' + str(it), filename=self.experiment_output_name + '_output.txt')

            # Trains machines using all of the training data frames
            self.PFSMRunner = self.train_all_models_multiple_dfs(self.PFSMRunner)
            self.model.current_runner = self.PFSMRunner

            # Calculate training and validation error at each iteration
            training_error.append(self.calculate_error_df(self.data_frames, _labels))
            print(training_error)

            if it > 0:
                if (training_error[-2] - training_error[-1] < 1e-2):
                    print_to_file('converged!', filename=self.experiment_output_name + '_output.txt')
                    save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner' + str(it) + '.pkl')
                    break

            save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner' + str(it) + '.pkl')
        save_object(training_error, self.experiment_output_name + '_training_error.pkl')
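The stopping rule in Example #3 declares convergence once the training error improves by less than 1e-2 between consecutive iterations. Written as a small standalone predicate (illustrative only, not part of the original class):

def has_converged(errors, tol=1e-2):
    # Converged when the drop in error between the last two recorded
    # iterations falls below the tolerance used in Example #3.
    return len(errors) >= 2 and (errors[-2] - errors[-1]) < tol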
Example #4
    def save_posteriors(self, filename='all_posteriors.pkl'):
        save_object(self.all_posteriors, filename)
Example #5
def train_loop(model, dname, data_dir, epochs=90, workers=4, gpu=None, resume='', weight_decay=1e-4,
               savedir='./', milestones=None, MC_samples=1, batch_size=256):
    mkdir(savedir)
    global best_acc1  # assumed to be initialized at module level (e.g. best_acc1 = 0)

    if gpu is not None:  # single-GPU training
        print("Use GPU: {} for training".format(gpu))
        torch.cuda.set_device(gpu)
        model = model.cuda(gpu)
    else:
        # DataParallel will divide and allocate batch_size across all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss(reduction='mean').cuda(gpu)

    # lr and momentum are assumed to be defined at module level in the original script
    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
    if milestones is None:  # if milestones are not specified, set to impossible value so LR is never decayed.
        milestones = [epochs + 1]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    tr_acc1_vec = []
    tr_acc5_vec = []
    tr_loss_vec = []
    acc1_vec = []
    acc5_vec = []
    loss_vec = []

    start_epoch = 0
    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            if gpu is None:
                checkpoint = torch.load(resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(gpu)
                checkpoint = torch.load(resume, map_location=loc)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(candidate_progress_file))
            try:
                tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(candidate_progress_file))

    cudnn.benchmark = True

    _, train_loader, val_loader, _, _, _ = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers, distributed=False, data_dir=data_dir)

    for epoch in range(start_epoch, epochs):

        # train for one epoch and update lr scheduler setting
        tr_acc1, tr_acc5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, gpu)
        print('used lr: %f' % optimizer.param_groups[0]["lr"])
        scheduler.step()

        tr_acc1_vec.append(tr_acc1)
        tr_acc5_vec.append(tr_acc5)
        tr_loss_vec.append(tr_loss)

        # evaluate on validation set
        acc1, acc5, loss = validate(val_loader, model, criterion, gpu, MC_samples=MC_samples)

        acc1_vec.append(acc1)
        acc5_vec.append(acc5)
        loss_vec.append(loss)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, is_best, savedir=savedir)

        all_results = [tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
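save_checkpoint is called in Example #5 but not defined on this page. A minimal sketch consistent with the call signature above and the common PyTorch checkpoint / model_best convention (an assumption about its body, not the project's verbatim code):

import os
import shutil
import torch

def save_checkpoint(state, is_best, savedir='./', filename='checkpoint.pth.tar'):
    # Persist the full training state and keep a separate copy of the best model so far.
    path = os.path.join(savedir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(savedir, 'model_best.pth.tar'))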