import logging

import joblib
from sklearn import cluster

# Project-local helpers (assumed import paths; they may differ in the actual repo).
import constants
import preprocess
import scaling


def fit(input_file=constants.FILE_DATA):
    log = logging.getLogger(constants.LOGGER_CLUSTER)
    log.info("Getting data from file")
    df = preprocess.get_data(input_file, 1)

    # Normalize data
    log.info("Normalizing data")
    dataset = scaling.scale_data(df)

    # Determine the number of clusters
    if constants.N_CLUSTERS == -1:
        # Derive the cluster count from the dataset size
        n_clusters = dataset.shape[0] // constants.CLUSTER_FACTOR
    else:
        # Directly use the number of clusters provided in the config
        n_clusters = constants.N_CLUSTERS

    # Build and fit the model
    log.info("Clustering on %s with %d clusters" % (input_file, n_clusters))
    model = cluster.MiniBatchKMeans(
        init=constants.INIT,
        n_clusters=n_clusters,
        batch_size=constants.BATCH_SIZE,
        n_init=constants.N_INIT,
        max_no_improvement=constants.MAX_NO_OF_IMPROVEMENT,
        verbose=constants.VERBOSE,
        random_state=constants.RANDOM_STATE)
    model.fit(dataset)
    print('Clustering successful')

    # Save the model to disk
    log.info("Saving model to disk")
    joblib.dump(model, constants.FILE_CLUSTER_MODEL)
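# --- Usage sketch (not part of the original script) ---
# Once fit() has written constants.FILE_CLUSTER_MODEL to disk, the persisted model
# can be reloaded with joblib and used to assign new observations to clusters. This
# assumes the same project-local constants/preprocess/scaling helpers as above, and
# that scaling.scale_new_data applies the already-fitted scaler to new rows (as in
# the partial-fit script below).
import joblib

import constants
import preprocess
import scaling

model = joblib.load(constants.FILE_CLUSTER_MODEL)
new_df = preprocess.get_data(constants.FILE_DATA, 1)
labels = model.predict(scaling.scale_new_data(new_df))
print("First cluster assignments:", labels[:10])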
import logging

import joblib
import numpy as np

# Project-local helpers (assumed import paths; they may differ in the actual repo).
import constants
import preprocess
import scaling


def fit(input_file=constants.FILE_PARTIAL_DATA):
    log = logging.getLogger(constants.LOGGER_PARTIAL_FIT)
    log.info("Partial fit on %s" % input_file)
    df = preprocess.get_data(input_file, 1, to_append=True)
    log.debug("Size of dataset: %d x %d" % df.shape)

    # Normalize data
    dataset = scaling.scale_new_data(df)

    # Load the cluster model from disk
    model = joblib.load(constants.FILE_CLUSTER_MODEL)
    lists = model.labels_
    log.debug("Old labels size: %d" % lists.shape)

    # Partial fit on the previously trained model
    model.partial_fit(dataset)

    # partial_fit only labels the new batch, so append its labels to the old ones
    lists = np.append(lists, model.labels_)
    model.labels_ = lists
    log.debug("New labels size: %d" % lists.shape)

    # Save the updated cluster model to disk
    joblib.dump(model, constants.FILE_CLUSTER_MODEL)
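# --- Standalone sketch (not from the original project) of the labels_ behaviour
# the function above works around: scikit-learn's MiniBatchKMeans sets labels_ to
# the labels of the data passed to the *latest* fit/partial_fit call only, so the
# running history has to be kept by appending, as done above. The data and
# parameters here are made up for illustration.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
first_batch = rng.rand(100, 4)
second_batch = rng.rand(20, 4)

model = MiniBatchKMeans(n_clusters=3, random_state=0)
model.partial_fit(first_batch)
all_labels = model.labels_               # labels for the first 100 rows

model.partial_fit(second_batch)          # labels_ now covers only the 20 new rows
all_labels = np.append(all_labels, model.labels_)
print(all_labels.shape)                  # (120,)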
import torch
import argparse
from torch.utils import data as torchdata

from utils.preprocess import get_data

"""
python3 train.py -d ../multi30k-dataset -batch_size 4
"""


def get_args():
    parser = argparse.ArgumentParser(description='Train unsupervised model.')
    parser.add_argument('-d', metavar='-d', type=str,
                        help='directory multi30k was cloned to '
                             '(https://github.com/multi30k/dataset)')
    parser.add_argument('-batch_size', type=int,
                        help='batch_size for training and validating dataloaders')
    return parser.parse_args()


args = get_args()
trainloader, valloader, TEXT = get_data(args)
trainbatcher = torchdata.DataLoader(trainloader, batch_size=args.batch_size,
                                    num_workers=4, shuffle=True)
valbatcher = torchdata.DataLoader(valloader, batch_size=args.batch_size,
                                  num_workers=4, shuffle=False)

# Smoke test: print the decoded tokens of the first batch, then stop.
for batch_ix, batch in enumerate(trainbatcher):
    print(batch_ix, batch)
    print([TEXT.itos[i][1] for i in batch['en'][0] if i != -2])
    print([TEXT.itos[i][1] for i in batch['de'][0] if i != -2])
    print([TEXT.itos[i][1] for i in batch['fr'][0] if i != -2])
    assert False
vis = visdom.Visdom()
vis.env = 'train'

#############################
model_dict = {
    'type': 'Attention',
    'D': 300,
    'num_encode': 2,
    'num_decode': 2,
    'num_epochs': 50,
    'fake': True,
    'pickled_fields': True
}
train_iter, val_iter, DE, EN = get_data(model_dict)

###########
model = torch.load('Attention2.p')
model.encoder.flatten_parameters()
model.decoder.flatten_parameters()
model.eval()

fname = 'PSET/source_test.txt'
answers = []
with open(fname, 'rb') as reader:
    for break_ix, line in tqdm(enumerate(reader)):
        src = Variable(
            torch.Tensor([
                DE.vocab.stoi[s]
                for s in line.decode('utf-8').strip('\n').split(' ')
            ]).long().unsqueeze(1))
                        help='type of model')
    parser.add_argument('-hidden', metavar='-hidden', type=int,
                        help='the size of z')
    parser.add_argument('-lr', metavar='-lr', type=float,
                        help='learning rate')
    parser.add_argument('-epochs', metavar='-epochs', type=int,
                        help='number of epochs')
    return parser.parse_args()


args = get_args()
train_loader, val_loader, test_loader = get_data(args)
model = get_model(args)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

for epoch in range(args.epochs):
    for data_ix, (img, label) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()

        # Reparameterization trick: z = mu + std * eps, with std = exp(0.5 * logvar)
        mu, logvar = model.get_encoding(Variable(img).cuda())
        std = logvar.mul(0.5).exp_()
        eps = Variable(std.data.new(std.size()).normal_())
        z = eps.mul(std).add_(mu)

        img_out = model.get_decoding(z)
        l_reconstruct = F.binary_cross_entropy_with_logits(
import torch
from tqdm import tqdm
import numpy as np
import visdom

from utils.models import Ensemb, model_dict
from utils.preprocess import get_data, get_model
from utils.postprocess import evaluate, write_submission, vis_display

# visdom
vis_windows = None
vis = visdom.Visdom()
vis.env = 'train'

#############################
train_iter, val_iter, test_iter, TEXT = get_data(model_dict)
model = get_model(model_dict)

# Only create an optimizer if the chosen model has trainable parameters.
trainable = False
if len(list(model.parameters())) > 0:
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
    trainable = True

for epoch in range(model_dict['num_epochs']):
    for batch_num, batch in enumerate(tqdm(train_iter)):
        if trainable:
            if len(batch.text) < model_dict['lookback'] + 1:
                continue
            model.train()
            optimizer.zero_grad()
# set up visdom
vis = visdom.Visdom()
vis.env = 'train'
vis_windows = None

# define model
# chosen_model = {'type': 'MNB', 'alpha':[.5], 'should_plot':False, 'counts': False, 'batch_size': 50}
# chosen_model = {'type': 'log_reg', 'batch_size': 150, 'counts':False}
# chosen_model = {'type': 'CBOW', 'batch_size': 100, 'pool': 'max'}
# chosen_model = {'type': 'Conv', 'batch_size': 50, 'embed_type': 'glove'}
chosen_model = {'type': 'resnet', 'batch_size': 4}
print(chosen_model)

# get data
TEXT, LABEL, train_iter, val_iter, test_iter = get_data(chosen_model)

if chosen_model['type'] == 'MNB':
    # Sweep the smoothing values listed in chosen_model['alpha']
    all_scores = []
    for alpha in chosen_model['alpha']:
        model = MNB(V=len(TEXT.vocab), alpha=alpha, counts=chosen_model['counts'])
        for batch_num, batch in enumerate(tqdm(train_iter)):
            model.train_sample(batch.label.data - 1, batch.text.data)
        model.postprocess()
        print_important(model.w.weight.data.cpu().squeeze(0), TEXT, 10)
        bce, roc, acc = evaluate_model(model, test_iter)
        print('alpha:', alpha)
        print('BCE:', bce, ' ROC:', roc, ' ACC:', acc)
        all_scores.append((bce, roc, acc))
    if chosen_model['should_plot']:
                        help='type of model')
    parser.add_argument('-hidden', metavar='-hidden', type=int,
                        help='the size of z')
    parser.add_argument('-lr', metavar='-lr', type=float,
                        help='learning rate')
    parser.add_argument('-epochs', metavar='-epochs', type=int,
                        help='number of epochs')
    return parser.parse_args()


args = get_args()
train_loader, val_loader, test_loader = get_data(args, bern=False)
model = get_model(args)

# Separate optimizers for the generator (decoder) and the discriminator.
optimizer_g = torch.optim.Adam(model.decoder.parameters(), lr=args.lr)
optimizer_d = torch.optim.Adam(model.discrim.parameters(), lr=args.lr)

for epoch in range(args.epochs):
    for data_ix, (img, label) in enumerate(train_loader):
        model.train()
        # rescale images from [0, 1] to [-1, 1]
        img = (2 * (img - .5)).squeeze(1)
        # sample latent noise and generate fake images
        z = Variable(torch.zeros(len(img), args.hidden).normal_().cuda())
        img_g = model.get_decoding(z)

        # train the discriminator
        optimizer_d.zero_grad()
        l_d = 0
        preds = model.get_discrim(Variable(img.view(-1,