Example #1
def main():
    """Fine-tune bert"""

    train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')),
                                 partition='train',
                                 n_files=cfg.get('data', 'n_files'))
    tr_texts, tr_labels = train_data.read()
    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          None, 'train', utils.to_lstm_inputs)

    val_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               n_files=cfg.get('data', 'n_files'))
    val_texts, val_labels = val_data.read()
    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        None, 'dev', utils.to_lstm_inputs)

    model = LstmClassifier()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    train(model, train_loader, val_loader, weights)
    evaluate(model, val_loader, weights)
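
The utils.make_data_loader helper itself is not reproduced in these snippets. Judging only from the call sites (texts, labels, batch size, optional max_len, partition name, conversion function), a minimal compatible sketch might look like the following; the to_inputs signature and the whole body are assumptions, not the project's actual code.

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_data_loader(texts, labels, batch_size, max_len, partition, to_inputs):
    # Assumption: to_inputs(texts, max_len) returns a tensor or a tuple of
    # tensors (e.g. input ids and attention masks) aligned with `labels`.
    model_inputs = to_inputs(texts, max_len)
    label_tensor = torch.LongTensor(labels)
    if isinstance(model_inputs, (list, tuple)):
        dataset = TensorDataset(*model_inputs, label_tensor)
    else:
        dataset = TensorDataset(model_inputs, label_tensor)
    # shuffle only the training partition
    return DataLoader(dataset, batch_size=batch_size,
                      shuffle=(partition == 'train'))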
Example #2
def main():
    """Fine-tune bert"""

    train_data = reldata.RelData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')),
                                 partition='train',
                                 n_files=cfg.get('data', 'n_files'))
    tr_texts, tr_labels = train_data.event_time_relations()
    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          cfg.getint('data', 'max_len'),
                                          'train', utils.to_transformer_inputs)

    val_data = reldata.RelData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               n_files=cfg.get('data', 'n_files'))
    val_texts, val_labels = val_data.event_time_relations()
    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        cfg.getint('data', 'max_len'), 'dev',
                                        utils.to_transformer_inputs)

    print('loaded %d training and %d validation samples' % \
          (len(tr_texts), len(val_texts)))

    model = TransformerClassifier()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    train(model, train_loader, val_loader, weights)
    evaluate(model, val_loader, weights, suppress_output=False)
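
The weight formula above is inverse class-frequency weighting for two classes. As a worked example of the arithmetic, and of how such weights are typically consumed (their use inside train()/evaluate() is not shown here and is assumed):

import torch
import torch.nn as nn

# with 800 negative and 200 positive labels:
label_counts = torch.bincount(torch.IntTensor([0] * 800 + [1] * 200))
weights = 1000 / (2.0 * label_counts)        # tensor([0.6250, 2.5000])

# assumption: a weighted cross-entropy is the usual consumer of such weights
criterion = nn.CrossEntropyLoss(weight=weights.float())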
Example #3
def main():
    """Fine-tune bert"""

    train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')),
                                 partition='train',
                                 n_files=cfg.get('data', 'n_files'))
    tr_texts, tr_labels = train_data.read()
    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          cfg.getint('data', 'max_len'),
                                          'train', utils.to_token_id_sequences)

    val_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               n_files=cfg.get('data', 'n_files'))
    val_texts, val_labels = val_data.read()
    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        cfg.getint('data', 'max_len'), 'dev',
                                        utils.to_token_id_sequences)

    model = BagOfEmbeddings()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    train(model, train_loader, val_loader, weights)
    evaluate(model, val_loader, weights)
Example #4
    def track_loss(self, relabel, plot=False, multi=False, feats=False):
        self.model.eval()
        acc = 0
        total_sum = 0
        with torch.no_grad():
            nargs = copy.deepcopy(self.args)
            loader, _ = make_data_loader(nargs, no_aug=True, **self.kwargs)
            loader.dataset.targets = relabel  #unshuffled original label guess

            tbar = tqdm(loader)
            tbar.set_description("Tracking loss")

            features = torch.tensor([])
            losses = torch.tensor([])
            for i, sample in enumerate(tbar):
                image, target, ids = sample["image"], sample["target"], sample[
                    "index"]
                if self.args.cuda:
                    target, image = target.cuda(), image.cuda()
                outputs, feat = self.model(image)
                features = torch.cat((features, feat.cpu()))

                loss = multi_class_loss(outputs, target)

                losses = torch.cat((losses, loss.detach().cpu()))

            losses = losses.view(-1)

            if feats:
                return losses, features
            return losses
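
multi_class_loss is not defined in this snippet; for the per-batch concatenation above to yield one loss value per sample, it has to return an unreduced loss. A plausible sketch, assuming soft (or one-hot) targets of shape [batch, num_classes]:

import torch.nn.functional as F

def multi_class_loss(outputs, targets):
    # outputs: [batch, num_classes] logits; targets: [batch, num_classes] soft labels
    # returns one loss value per sample, so torch.cat can stack them
    return -(targets * F.log_softmax(outputs, dim=1)).sum(dim=1)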
Example #5
    def __init__(self, args):
        self.args = args

        kwargs = {"num_classes": self.args.num_class}

        if args.net == "resnet18":
            from nets.resnet import ResNet18
            model = ResNet18(pretrained=(args.load == 'imagenet'), **kwargs)
        elif args.net == "resnet50":
            from nets.resnet import ResNet50
            model = ResNet50(pretrained=(args.load == 'imagenet'), **kwargs)
        elif args.net == "wideresnet282":
            from nets.wideresnet import WRN28_2
            model = WRN28_2(**kwargs)
        else:
            raise NotImplementedError

        print("Number of parameters",
              sum(p.numel() for p in model.parameters() if p.requires_grad))

        self.model = nn.DataParallel(model).cuda()

        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.args.lr,
                                         momentum=0.9,
                                         nesterov=True,
                                         weight_decay=5e-4)
        self.criterion = nn.CrossEntropyLoss(ignore_index=-1)
        self.criterion_nored = nn.CrossEntropyLoss(reduction="none")

        self.kwargs = {"num_workers": 12, "pin_memory": False}
        self.train_loader, self.val_loader = make_data_loader(
            args, **self.kwargs)

        self.best = 0
        self.best_epoch = 0
        self.acc = []
        self.train_acc = []
        self.med_clean = []
        self.med_noisy = []
        self.perc_clean = []
        self.perc_noisy = []

        self.reductor_plot = umap.UMAP(n_components=2)

        self.toPIL = torchvision.transforms.ToPILImage()

        self.unorm = UnNormalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))
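
UnNormalize is also not shown on this page. A minimal sketch of the usual inverse of torchvision.transforms.Normalize, which is what converting tensors back to viewable PIL images requires, could read as follows (implementation details are an assumption):

class UnNormalize:
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        # tensor: [C, H, W] normalized as (x - mean) / std per channel
        out = tensor.clone()
        for channel, m, s in zip(out, self.mean, self.std):
            channel.mul_(s).add_(m)
        return out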
Example #6
    def label_propagation(self, plot=None, diffuse=False):
        self.model.eval()
        with torch.no_grad():
            transform = None
            if self.args.load == "imagenet":
                transform = torchvision.transforms.Resize(
                    224)  #Was trained at a different resolution than cifar
            loader, _ = make_data_loader(self.args,
                                         no_aug=True,
                                         transform=transform,
                                         **self.kwargs)
            dim = 2048 if self.args.net == "resnet50" else 512
            dim = dim if self.args.net != "wideresnet282" else 128
            features_average = torch.zeros((len(loader.dataset), dim))

            features = torch.tensor([])
            tbar = tqdm(loader)
            for i, sample in enumerate(tbar):
                image, target, ids = sample["image"], sample["target"], sample[
                    "index"]
                if self.args.cuda:
                    target, image = target, image.cuda()
                outputs, feat = self.model(image)
                features = torch.cat((features, feat.cpu()))
            features_average += features
            torch.cuda.empty_cache()
        targ = torch.tensor(loader.dataset.targets)
        features = features_average.numpy()

        #Normalize the features + PCA whitening
        faiss.normalize_L2(features)
        pca = PCA(whiten=True, n_components=features.shape[1])
        features = pca.fit_transform(features)
        features = np.ascontiguousarray(features)

        labels = -torch.ones(targ.shape[0])

        for i, ii in enumerate(self.indicies):
            labels[ii] = targ[ii]  #known samples

        if diffuse:  #Diffusion
            final_labels = torch.zeros(targ.shape[0], self.args.num_class)
            weights = torch.zeros(targ.shape[0])
            p_labels, p_weights, class_weights = diffusion(
                features,
                labels.clone(),
                self.indicies,
                k=200,
                max_iter=50,
                classes=self.args.num_class)
            p_labels = torch.from_numpy(p_labels).float()
            p_weights = torch.from_numpy(p_weights).float()
        else:  #KNN
            index = faiss.IndexFlatIP(features.shape[1])
            index.add(features[self.indicies])
            _, I = index.search(features, 1)
            p_labels = labels[self.indicies[I]]
            p_weights = torch.ones(features.shape[0])

        if plot is not None:  #Optional UMap plots
            embedding = self.reductor_plot.fit_transform(features)
            emb = embedding[self.indicies]  #Centroids, at least one per class
            plt.figure(7)
            plt.scatter(embedding[:, 0],
                        embedding[:, 1],
                        c=[
                            sns.color_palette(n_colors=self.args.num_class)[x]
                            for x in targ
                        ],
                        s=0.1)
            plt.scatter(emb[:, 0],
                        emb[:, 1],
                        c=[
                            sns.color_palette(n_colors=self.args.num_class)[x]
                            for x in targ[self.indicies]
                        ],
                        marker="*")
            plt.scatter(emb[:, 0], emb[:, 1], c="#000000", marker="*", s=1.1)
            plt.savefig("data/embedding{}.png".format(plot))

            df = pd.DataFrame(embedding, columns=["x", "y"])
            sns_plot = sns.jointplot(x="x", y="y", data=df, kind="kde")
            sns_plot.savefig("data/embedding_density{}.png".format(plot))

            plt.figure(6)
            plt.scatter(embedding[:, 0],
                        embedding[:, 1],
                        c=[
                            sns.color_palette(n_colors=self.args.num_class)[x]
                            for x in torch.argmax(p_labels, dim=1)
                        ],
                        s=0.1)
            plt.scatter(
                emb[:, 0],
                emb[:, 1],
                c=[
                    sns.color_palette(n_colors=self.args.num_class)[x]
                    for x in torch.argmax(p_labels[self.indicies], dim=1)
                ],
                marker="*")
            plt.scatter(emb[:, 0], emb[:, 1], c="#000000", marker="*", s=1.1)
            plt.savefig("data/embedding_diffusion{}.png".format(plot))
            plt.close()

        if diffuse:
            labels = torch.zeros(features.shape[0], self.args.num_class)
            for i, p in enumerate(torch.argmax(p_labels, 1)):
                labels[i][p.item()] = 1
        else:
            labels = torch.zeros(features.shape[0], self.args.num_class)
            for i, p in enumerate(p_labels.long()):
                labels[i][p] = 1
            p_labels = labels

        del features
        torch.cuda.empty_cache()
        return p_labels, p_weights
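
For reference, the KNN branch above is plain 1-nearest-neighbour label propagation over L2-normalized features (inner product equals cosine similarity after normalization). Isolated on toy data, with illustrative shapes and labels, the same pattern reads:

import numpy as np
import faiss

feats = np.random.rand(100, 32).astype('float32')
faiss.normalize_L2(feats)                      # cosine similarity via inner product
known = np.array([0, 10, 20])                  # indices of trusted samples
labels = np.full(100, -1, dtype=np.int64)
labels[known] = [0, 1, 2]

index = faiss.IndexFlatIP(feats.shape[1])
index.add(feats[known])                        # search only among trusted samples
_, I = index.search(feats, 1)                  # nearest trusted neighbour per sample
propagated = labels[known[I.ravel()]]          # every sample inherits that label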
Example #7
def main(args):
    """ Train collaborative filtering """
    # Clean slate
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load data
    R = load_data(args.dataset_str)

    # Split into training/test
    n_users, n_items = R.shape
    R_train, R_test = train_test_split(R, test_size=args.test_size)

    # Make data handlers
    train_loader = make_data_loader(
        R_train,
        batch_size=args.batch_size,
    )
    test_loader = make_data_loader(
        R_test,
        batch_size=args.batch_size,
    )

    # Define model, loss, optimizer
    model = MatrixFactorization(n_users, n_items, n_factors=args.n_latent)
    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

    for i in range(args.num_epochs):  # loop over the dataset multiple times
        train_loss = 0.0
        test_loss = 0.0

        for j, (data) in enumerate(train_loader):
            # Turn data into variables
            row, col, rating = data
            rating = Variable(rating.float())
            row = Variable(row.long())
            col = Variable(col.long())

            # Make prediction
            prediction = model(row, col)
            loss = loss_func(prediction, rating)

            # zero the parameter gradients
            optimizer.zero_grad()

            # Backpropagate
            loss.backward()

            # Update the parameters
            optimizer.step()

            # Save loss
            train_loss += loss.item()  # .item() replaces the old loss.data[0] idiom

        # -----------------------
        # Print every epoch
        for test_data in test_loader:
            row, col, rating = test_data
            rating = Variable(rating.float())
            row = Variable(row.long())
            col = Variable(col.long())

            prediction = model(row, col)
            loss = loss_func(prediction, rating)
            test_loss += loss.item()

        # print statistics
        print('epoch: {}, train_loss: {}, test_loss: {}'.format(
            i + 1, train_loss / len(train_loader),
            test_loss / len(test_loader)))

    print('Finished Training!')
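
The make_data_loader used in this example is not shown either. Given that each batch unpacks into (row, col, rating), a compatible sketch would enumerate the observed entries of the rating matrix; the body below is an assumption:

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

def make_data_loader(R, batch_size):
    rows, cols = R.nonzero()                     # observed (user, item) pairs
    ratings = np.asarray(R[rows, cols]).ravel()  # works for dense and scipy.sparse
    dataset = TensorDataset(torch.from_numpy(rows).long(),
                            torch.from_numpy(cols).long(),
                            torch.from_numpy(ratings).float())
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)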
Example #8
from models import RNNEncoder, RNNDecoder, LinearClassifier, Seq2Seq, MultiSeq2Seq
from train_emotion import train_emotion
from train_seq2seq import eval_seq2seq, train_seq2seq
from train_multitask import train_multitask
from test import test
from utils import constant, DialogDataset, make_data_loader, collate_fn


if __name__ == "__main__":
    fasttext = (constant.embedding == 'fasttext')
    aug = (constant.parse == 'augment')
    sld = (constant.parse == 'sliding')
    train_dataset = DialogDataset(mode='train', dataset=constant.data, sld=sld, aug=aug, path=None, load_fasttext=fasttext)
    dev_dataset = DialogDataset(mode='dev', dataset=constant.data, sld=False, aug=False, path=None, load_fasttext=fasttext)
    test_dataset = DialogDataset(mode='test', dataset=constant.data, sld=False, aug=False, path=None, load_fasttext=fasttext)
    train_dataloader = make_data_loader(train_dataset, constant.USE_CUDA, constant.embeddings_cpu, constant.B, full=constant.full, sld=sld, aug=aug, pad_idx=1, shuffle=constant.shuffle)
    dev_dataloader = make_data_loader(dev_dataset, constant.USE_CUDA, constant.embeddings_cpu, constant.B, full=constant.full, sld=sld, aug=aug, pad_idx=1, shuffle=constant.shuffle)
    test_dataloader = make_data_loader(test_dataset, constant.USE_CUDA, constant.embeddings_cpu, constant.B, full=constant.full, sld=sld, aug=aug, pad_idx=1, shuffle=constant.shuffle)
 
    print()
    for dialogs, lens, targets, target_lens, emotions in train_dataloader:
        print('train')
        print("dialogs: ", dialogs.shape)
        print("lens: ", lens.shape)
        print("targets: ", targets.shape)
        print("target_lens: ", target_lens.shape)
        print("emotions: ", emotions.shape)
        break

    print()
    for dialogs, lens, targets, target_lens, emotions in dev_dataloader:
        print('dev')
        print("dialogs: ", dialogs.shape)
        print("lens: ", lens.shape)
        print("targets: ", targets.shape)
        print("target_lens: ", target_lens.shape)
        print("emotions: ", emotions.shape)
        break
Example #9
def main():
    """Fine-tune bert"""

    #
    # split train into train and validation and evaluate
    #

    tr_texts, tr_labels = datareader.DirDataReader.read(
        os.path.join(base, cfg.get('data', 'train')), {
            'no': 0,
            'yes': 1
        })

    tr_texts, val_texts, tr_labels, val_labels = train_test_split(
        tr_texts, tr_labels, test_size=0.20, random_state=2020)

    tok = tokenizer.Tokenizer(cfg.getint('data', 'vocab_size'))
    tok.fit_on_texts(tr_texts)

    tr_texts = tok.texts_as_sets_to_seqs(tr_texts)
    val_texts = tok.texts_as_sets_to_seqs(val_texts)

    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          cfg.getint('data', 'max_len'),
                                          'train', utils.to_transformer_inputs)

    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        cfg.getint('data', 'max_len'), 'dev',
                                        utils.to_transformer_inputs)

    print('loaded %d training and %d validation samples' % \
          (len(tr_texts), len(val_texts)))

    model = TransformerClassifier()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    best_roc, optimal_epochs = fit(model, train_loader, val_loader, weights,
                                   cfg.getint('model', 'num_epochs'))
    print('roc auc %.3f after %d epochs' % (best_roc, optimal_epochs))

    #
    # now retrain and evaluate on test
    #

    tr_texts, tr_labels = datareader.DirDataReader.read(
        os.path.join(base, cfg.get('data', 'train')), {
            'no': 0,
            'yes': 1
        })

    test_texts, test_labels = datareader.DirDataReader.read(
        os.path.join(base, cfg.get('data', 'test')), {
            'no': 0,
            'yes': 1
        })

    tok = tokenizer.Tokenizer(cfg.getint('data', 'vocab_size'))
    tok.fit_on_texts(tr_texts)

    tr_texts = tok.texts_as_sets_to_seqs(tr_texts)
    test_texts = tok.texts_as_sets_to_seqs(test_texts)

    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          cfg.getint('data', 'max_len'),
                                          'train', utils.to_transformer_inputs)

    test_loader = utils.make_data_loader(test_texts, test_labels,
                                         cfg.getint('model', 'batch_size'),
                                         cfg.getint('data', 'max_len'), 'test',
                                         utils.to_transformer_inputs)

    print('loaded %d training and %d test samples' % \
          (len(tr_texts), len(test_texts)))

    model = TransformerClassifier()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    fit(model, train_loader, test_loader, weights, optimal_epochs)
Example #10
def train_model(args):
    print("\nTraining model...")
    # check if results_dir arg is set
    if not args["results_dir"]:
        args["results_dir"] = "pipeline_files/" + args["run_name"] + "/models/"

    # get model name
    if args["model_dir_name"]:
        model_dir_path = args["results_dir"] + args["model_dir_name"]
    else:
        # make model directory name time/date if no name specified in args
        model_dir_path = args["results_dir"] + str(
            datetime.now().strftime('%m-%d_%H:%M/'))

    # add/update model_dir_path key in args dict
    args["model_dir_path"] = model_dir_path

    # setup directory to save models
    if not os.path.exists(model_dir_path):
        os.makedirs(model_dir_path)

    # save training details
    copy(args['config_file'], model_dir_path)

    # get cuda device
    device = torch.device(args["cuda_device"])

    # get training data path
    if args['training_data_path']:
        training_data_path = args['training_data_path']
    else:
        training_data_path = "pipeline_files/" + args[
            'run_name'] + "/data/train_data.csv"

    # generate indices for training and validation
    a = np.arange(len(pd.read_csv(training_data_path)))
    # get the specified fraction of test data to use
    frac = args["train_data_fraction"]
    tr_ind, val_ind = train_test_split(a,
                                       train_size=0.8 * frac,
                                       test_size=0.2 * frac,
                                       shuffle=True)

    # init training data loader
    train_loader = make_data_loader(training_data_path,
                                    indices=tr_ind,
                                    batch_size=args["batch_size"],
                                    feature_cols=args["feature_cols"],
                                    label_cols=args["label_cols"])
    # init validation data loader
    val_loader = make_data_loader(training_data_path,
                                  indices=val_ind,
                                  batch_size=args["batch_size"],
                                  feature_cols=args["feature_cols"],
                                  label_cols=args["label_cols"])

    # use Huber (smooth L1) loss so that outliers don't dominate training
    criterion = torch.nn.SmoothL1Loss(reduction='none')

    # start training
    train(device,
          model_dir_path,
          train_loader,
          val_loader,
          args["nn_layers"],
          args["epochs"],
          args["lr"],
          args["weight_decay"],
          criterion=criterion,
          loss_weights=args["loss_weights"])
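
Here too make_data_loader is project-specific and not reproduced. Judging from the keyword arguments, a compatible sketch would read the CSV, select the requested row indices and columns, and return a standard DataLoader; everything below is an assumption:

import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def make_data_loader(csv_path, indices, batch_size, feature_cols, label_cols):
    df = pd.read_csv(csv_path).iloc[indices]
    features = torch.tensor(df[feature_cols].values, dtype=torch.float32)
    labels = torch.tensor(df[label_cols].values, dtype=torch.float32)
    return DataLoader(TensorDataset(features, labels),
                      batch_size=batch_size, shuffle=True)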