Example #1
def fit(input_file=constants.FILE_DATA):
    log = logging.getLogger(constants.LOGGER_CLUSTER)

    log.info("Getting data from file")
    df = preprocess.get_data(input_file, 1)

    #Normalize data
    log.info("Normalizing data")
    dataset = scaling.scale_data(df)
    #Determine number of clusters
    if (constants.N_CLUSTERS == -1):
        #Derive cluster count from dataset size; MiniBatchKMeans needs an integer
        n_clusters = dataset.shape[0] // constants.CLUSTER_FACTOR
    else:
        #Directly use number of clusters provided in config
        n_clusters = constants.N_CLUSTERS

    #Make model
    log.info("Clustering on %s with %d clusters" % (input_file, n_clusters))
    model = cluster.MiniBatchKMeans(
        init=constants.INIT,
        n_clusters=n_clusters,
        batch_size=constants.BATCH_SIZE,
        n_init=constants.N_INIT,
        max_no_improvement=constants.MAX_NO_OF_IMPROVEMENT,
        verbose=constants.VERBOSE,
        random_state=constants.RANDOM_STATE)
    model.fit(dataset)

    print('Clustering successful')

    #Save model to disk
    log.info("Saving models to disk")
    joblib.dump(model, constants.FILE_CLUSTER_MODEL)

    return
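
A minimal follow-up sketch of how the persisted model might be used afterwards; it assumes the same project-local constants and scaling modules, that scaling.scale_new_data applies the scaler fitted above (as in Example #2), and the hypothetical helper name assign_clusters.

import joblib

def assign_clusters(df_new):
    # Load the MiniBatchKMeans model saved by fit() above (assumed path).
    model = joblib.load(constants.FILE_CLUSTER_MODEL)
    # New samples must be scaled the same way as the training data.
    dataset = scaling.scale_new_data(df_new)
    # predict() returns the index of the nearest cluster centre for each row.
    return model.predict(dataset)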
Example #2
def fit(input_file=constants.FILE_PARTIAL_DATA):
    log = logging.getLogger(constants.LOGGER_PARTIAL_FIT)

    log.info("Partial fit on %s" % input_file)
    df = preprocess.get_data(input_file, 1, to_append=True)
    log.debug("Size of dataset: %d x %d" % df.shape)
    #Normalize data
    dataset = scaling.scale_new_data(df)

    #Load cluster model from disk
    model = joblib.load(constants.FILE_CLUSTER_MODEL)
    lists = model.labels_
    log.debug("Old labels size: %d" % lists.shape)

    #Partial fit on the old trained model
    model.partial_fit(dataset)

    #Append the new batch labels: partial_fit overwrites labels_ with labels
    #for the latest batch only, so the old labels must be kept manually
    lists = np.append(lists, model.labels_)
    model.labels_ = lists

    log.debug("New labels size: %d" % lists.shape)
    #Save updated cluster model to disk
    joblib.dump(model, constants.FILE_CLUSTER_MODEL)
Example #3
import torch
import argparse
from torch.utils import data as torchdata

from utils.preprocess import get_data

"""
python3 train.py -d ../multi30k-dataset -batch_size 4
"""

def get_args():
    parser = argparse.ArgumentParser(description='Train unsupervised model.')
    parser.add_argument('-d', metavar='-d', type=str,
                        help='directory multi30k was cloned to (https://github.com/multi30k/dataset)')
    parser.add_argument('-batch_size', type=int,
                        help='batch_size for training and validating dataloaders')
    return parser.parse_args()

args = get_args()
trainloader, valloader, TEXT = get_data(args)
trainbatcher = torchdata.DataLoader(trainloader, batch_size=args.batch_size, num_workers=4, shuffle=True)
valbatcher = torchdata.DataLoader(valloader, batch_size=args.batch_size, num_workers=4, shuffle=False)

for batch_ix,batch in enumerate(trainbatcher):
    print(batch_ix, batch)
    print([TEXT.itos[i][1] for i in batch['en'][0] if i != -2])
    print([TEXT.itos[i][1] for i in batch['de'][0] if i != -2])
    print([TEXT.itos[i][1] for i in batch['fr'][0] if i != -2])
    break  # inspect only the first batch
Example #4
vis = visdom.Visdom()
vis.env = 'train'

#############################

model_dict = {
    'type': 'Attention',
    'D': 300,
    'num_encode': 2,
    'num_decode': 2,
    'num_epochs': 50,
    'fake': True,
    'pickled_fields': True
}

train_iter, val_iter, DE, EN = get_data(model_dict)

###########
model = torch.load('Attention2.p')
model.encoder.flatten_parameters()
model.decoder.flatten_parameters()
model.eval()
fname = 'PSET/source_test.txt'
answers = []
with open(fname, 'rb') as reader:
    for break_ix, line in tqdm(enumerate(reader)):
        src = Variable(
            torch.Tensor([
                DE.vocab.stoi[s]
                for s in line.decode('utf-8').strip('\n').split(' ')
            ]).long().unsqueeze(1))
Example #5
                        help='type of model')
    parser.add_argument('-hidden',
                        metavar='-hidden',
                        type=int,
                        help='the size of z')
    parser.add_argument('-lr', metavar='-lr', type=float, help='learning rate')
    parser.add_argument('-epochs',
                        metavar='-epochs',
                        type=int,
                        help='number of epochs')

    return parser.parse_args()


args = get_args()
train_loader, val_loader, test_loader = get_data(args)
model = get_model(args)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

for epoch in range(args.epochs):
    for data_ix, (img, label) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()

        mu, logvar = model.get_encoding(Variable(img).cuda())
        std = logvar.mul(0.5).exp_()
        eps = Variable(std.data.new(std.size()).normal_())
        z = eps.mul(std).add_(mu)
        img_out = model.get_decoding(z)

        l_reconstruct = F.binary_cross_entropy_with_logits(
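
The excerpt above cuts off inside the reconstruction term. For reference, a minimal sketch of how a VAE loss of this shape is typically completed; the helper name vae_loss and its arguments are illustrative, not part of the original code.

import torch
import torch.nn.functional as F

def vae_loss(img_logits, img_target, mu, logvar):
    # Reconstruction term: pixel-wise binary cross-entropy on the decoder
    # logits, summed over pixels and averaged over the batch.
    bce = F.binary_cross_entropy_with_logits(
        img_logits.view(img_logits.size(0), -1),
        img_target.view(img_target.size(0), -1),
        reduction='sum') / img_logits.size(0)
    # Closed-form KL divergence between N(mu, sigma^2) and N(0, 1).
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / mu.size(0)
    return bce + kld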
Example #6
import torch
from tqdm import tqdm
import numpy as np
import visdom

from utils.models import Ensemb, model_dict
from utils.preprocess import get_data, get_model
from utils.postprocess import evaluate, write_submission, vis_display

# visdom
vis_windows = None
vis = visdom.Visdom()
vis.env = 'train'

#############################

train_iter, val_iter, test_iter, TEXT = get_data(model_dict)

model = get_model(model_dict)

trainable = False
if len(list(model.parameters())) > 0:
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
    trainable = True

for epoch in range(model_dict['num_epochs']):
    for batch_num, batch in enumerate(tqdm(train_iter)):
        if trainable:
            if len(batch.text) < model_dict['lookback'] + 1:
                continue
            model.train()
            optimizer.zero_grad()
Example #7
# set up visdom
vis = visdom.Visdom()
vis.env = 'train'
vis_windows = None

# define model
# chosen_model = {'type': 'MNB', 'alpha':[.5], 'should_plot':False, 'counts': False, 'batch_size': 50}
# chosen_model = {'type': 'log_reg', 'batch_size': 150, 'counts':False}
# chosen_model = {'type': 'CBOW', 'batch_size': 100, 'pool': 'max'}
# chosen_model = {'type': 'Conv', 'batch_size': 50, 'embed_type': 'glove'}
chosen_model = {'type': 'resnet', 'batch_size': 4}
print(chosen_model)

# get data
TEXT, LABEL, train_iter, val_iter, test_iter = get_data(chosen_model)

if chosen_model['type'] == 'MNB':
    all_scores = []
    for alpha in chosen_model['alpha']:
        model = MNB(V=len(TEXT.vocab), alpha=alpha, counts=chosen_model['counts'])

        for batch_num, batch in enumerate(tqdm(train_iter)):
            model.train_sample(batch.label.data - 1, batch.text.data)
        model.postprocess()
        print_important(model.w.weight.data.cpu().squeeze(0), TEXT, 10)
        bce, roc, acc = evaluate_model(model, test_iter)
        print('alpha:', alpha)
        print('BCE:', bce, '  ROC:', roc, '  ACC:', acc)
        all_scores.append((bce, roc, acc))
    if chosen_model['should_plot']:
Example #8
                        help='type of model')
    parser.add_argument('-hidden',
                        metavar='-hidden',
                        type=int,
                        help='the size of z')
    parser.add_argument('-lr', metavar='-lr', type=float, help='learning rate')
    parser.add_argument('-epochs',
                        metavar='-epochs',
                        type=int,
                        help='number of epochs')

    return parser.parse_args()


args = get_args()
train_loader, val_loader, test_loader = get_data(args, bern=False)
model = get_model(args)
optimizer_g = torch.optim.Adam(model.decoder.parameters(), lr=args.lr)
optimizer_d = torch.optim.Adam(model.discrim.parameters(), lr=args.lr)

for epoch in range(args.epochs):
    for data_ix, (img, label) in enumerate(train_loader):
        model.train()
        img = (2 * (img - .5)).squeeze(1)
        z = Variable(torch.zeros(len(img), args.hidden).normal_().cuda())
        img_g = model.get_decoding(z)

        # train the discriminator
        optimizer_d.zero_grad()
        l_d = 0
        preds = model.get_discrim(Variable(img.view(-1,
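
The excerpt stops while scoring the real batch. A minimal sketch, under the assumptions above, of how the two updates are typically completed: the discriminator is pushed to score real images as 1 and generated images as 0, then the generator is updated to fool it. The helper name gan_step and the exact tensor shapes are illustrative, not from the original code.

import torch
import torch.nn.functional as F

def gan_step(model, img, img_g, optimizer_d, optimizer_g):
    ones = torch.ones(img.size(0), 1)
    zeros = torch.zeros(img.size(0), 1)

    # Discriminator update: real images labelled 1, generated images labelled 0.
    # detach() keeps generator gradients out of the discriminator step.
    optimizer_d.zero_grad()
    l_d = F.binary_cross_entropy_with_logits(model.get_discrim(img), ones)
    l_d = l_d + F.binary_cross_entropy_with_logits(
        model.get_discrim(img_g.detach()), zeros)
    l_d.backward()
    optimizer_d.step()

    # Generator update: make the discriminator score generated images as 1.
    optimizer_g.zero_grad()
    l_g = F.binary_cross_entropy_with_logits(model.get_discrim(img_g), ones)
    l_g.backward()
    optimizer_g.step()
    return l_d.item(), l_g.item()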