def extract_pages(origin_file, pages_dir):
    errors = []
    wiki_file = open(origin_file, 'r')
    page_counter = 0
    letter = wiki_file.read(1)
    read_letters = ''
    while letter != '':
        read_letters += letter
        if read_letters[-6:] == '<page>':
            try:
                extract_page(wiki_file, pages_dir)
                page_counter += 1
                read_letters = ''
                if page_counter % 200 == 0:
                    print_inline(".")
                if page_counter % 10000 == 0:
                    print_inline('\n' + str(page_counter) + ' páginas')
            except Exception as e:
                errors.append((e, e.file_name))
        letter = wiki_file.read(1)
    print 'Quantidade de erros: ' + str(len(errors))
    print errors
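# NOTE: every snippet in this section calls a `print_inline` helper that is never
# defined here. The sketch below is a hypothetical minimal implementation, assuming
# the helper only needs to write to stdout without a trailing newline and flush so
# progress dots and counters appear immediately; the real helper may do more.
import sys

def print_inline(s):
    # write without a newline and flush, so repeated calls extend the same console line
    sys.stdout.write(str(s))
    sys.stdout.flush()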
def __init__(self):
    dictionary = Word2Vec.load(params.dictionary_path)
    self.common_words = pickle.load(open(params.common_words_path, 'rb'))
    self.uncommon_words = pickle.load(open(params.uncommon_words_path, 'rb'))
    self.word_vectors = dictionary.wv
    del dictionary
    embeddings = word_vectors_2_embedding(self.word_vectors)
    self.embedding_table = nn.Embedding(*embeddings.shape)
    self.embedding_table.weight.data.copy_(torch.tensor(embeddings))
    self.embedding_table.weight.requires_grad = False
    print('Finish loading Word2Vec model! Size: ({},{})'.format(embeddings.shape[0], embeddings.shape[1]))
    self.embedding_size = embeddings.shape[1]
    self.net = GatedCNN(self.embedding_size, params.num_channels)
    if torch.cuda.is_available():
        self.net = self.net.cuda()
        self.net.load_state_dict(torch.load(params.save_path))
    else:
        self.net.load_state_dict(torch.load(params.save_path, map_location=lambda storage, loc: storage))
    self.net.eval()
    with open('assets/items.pickle', mode='rb') as fp:
        self.items = pickle.load(fp)
    if os.path.exists(params.embedding_path):
        with open(params.embedding_path, mode='rb') as fp:
            self.embedding_items = pickle.load(fp)
    else:
        batch = build_batch(self.items, self.word_vectors, 10, self.common_words, self.uncommon_words)
        self.embedding_items = {}
        count = 0
        for newsId, index_vector in batch.items():
            count += 1
            print_inline('Calculating item embedding {}/{}'.format(count, len(self.items)))
            try:
                index_vector = torch.tensor(index_vector)
                inputs = self.embedding_table(index_vector)
                inputs = torch.FloatTensor(inputs)
                inputs = inputs.unsqueeze(0).permute(0, 2, 1)  # (batch_size, embedding_size, seq_len)
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                doc_embedding = self.net(inputs)[0].cpu().detach().numpy()
                doc_embedding = doc_embedding / LA.norm(doc_embedding)
                self.embedding_items[newsId] = doc_embedding
            except Exception as e:
                print(e)
        with open(params.embedding_path, mode='wb') as fp:
            pickle.dump(self.embedding_items, fp, pickle.HIGHEST_PROTOCOL)
    embedding_items = np.array(list(self.embedding_items.values()))
    self.mean_vector, self.std_vector = np.mean(embedding_items, axis=0), np.std(embedding_items, axis=0)
    self.embedding_items = {k: (v - self.mean_vector) for k, v in self.embedding_items.items()}
def optimize(self, nnet):
    timer = Stopwatch(verbose=False).start()
    self.total_epochs += self.max_epochs
    for i in xrange(self.max_epochs):
        self.epoch += 1
        if self.verbose:
            print_inline('Epoch {0:>{1}}/{2} '.format(
                self.epoch, len(str(self.total_epochs)), self.total_epochs))
        if self.verbose and self.early_stopping and nnet._X_val is not None:
            print_inline(' early stopping after {0} '.format(self._early_stopping))
        losses = self.train_epoch(nnet)
        self.loss_history.append(losses)
        msg = 'elapsed: {0} sec'.format(
            width_format(timer.elapsed(), default_width=5, max_precision=2))
        msg += ' - loss: {0}'.format(
            width_format(np.mean(losses), default_width=5, max_precision=4))
        score = nnet._metric(nnet._y, nnet.validate())
        self.score_history.append(score)
        # TODO: change acc to metric name
        msg += ' - acc.: {0}'.format(
            width_format(score, default_width=6, max_precision=4))
        if nnet._X_val is not None:
            if self._early_stopping > 0 and self.epoch > 1:
                self._early_stopping -= 1
            val_loss = nnet._loss(nnet._y_val, nnet.validate_proba(nnet._X_val))
            self.val_loss_history.append(val_loss)
            val_score = nnet._metric(nnet._y_val, nnet.validate(nnet._X_val))
            if self.epoch > 1 and val_score < 0.2 * self.val_score_history[-1]:
                return
            self.val_score_history.append(val_score)
            if self.epoch > 1 and val_score > nnet.best_val_score_:
                nnet.best_val_score_ = val_score
                nnet.best_epoch_ = self.epoch  # TODO: move to optimizer
                nnet._save_best_weights()
                self._early_stopping = self.early_stopping  # reset counter
            msg += ' - val. loss: {0}'.format(
                width_format(val_loss, default_width=5, max_precision=4))
            # TODO: fix acc.
            msg += ' - val. acc.: {0}'.format(
                width_format(val_score, default_width=6, max_precision=4))
            if self._early_stopping == 0:
                if self.verbose:
                    print msg
                return
        if self.verbose:
            print msg
        if self.epoch > 1 and self.plot:
            if not os.path.exists(self.plot_dirpath):
                os.makedirs(self.plot_dirpath)
            plot_learning_curves(self.loss_history, self.score_history,
                                 self.val_loss_history, self.val_score_history,
                                 dirpath=self.plot_dirpath)
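# NOTE: `width_format` (used above and in `_fit` and `fit` below) is also not defined
# in this section. The version below is only a hypothetical sketch, assuming the helper
# formats a number to fit within `default_width` characters with at most `max_precision`
# digits after the decimal point; the original helper may differ.
def width_format(x, default_width=8, max_precision=3):
    # try decreasing precision until the rendered number fits the target width
    for precision in range(max_precision, -1, -1):
        s = '{0:.{1}f}'.format(float(x), precision)
        if len(s) <= default_width:
            return s
    return s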
def train_epoch(self, X):
    mean_recons = []
    for i, X_batch in enumerate(self.batch_iter(X)):
        mean_recons.append(self.update(X_batch))
        if self.verbose and i % (len(X) / (self.batch_size * 16)) == 0:
            print_inline('.')
    if self.verbose:
        print_inline(' ')
    return np.mean(mean_recons)
def train_epoch(self, train_loader):
    self.model.train()
    epoch_iter = 0
    epoch_train_loss = 0.
    epoch_correct = 0
    epoch_total = 0
    epoch_acc = 0.
    epoch_train_loss_history = []
    for (X_batch, manip), (y_batch, soft_logits) in progress_iter(iterable=train_loader,
                                                                  verbose=self.verbose,
                                                                  leave=True, ncols=64, desc='epoch'):
        if self.use_cuda:
            X_batch, y_batch = X_batch.cuda(), y_batch.cuda()
            manip = manip.cuda()
            soft_logits = soft_logits.cuda()
        X_batch, y_batch = Variable(X_batch), Variable(y_batch)
        manip = Variable(manip, requires_grad=False)
        soft_logits = Variable(soft_logits, requires_grad=False)
        self.optim.zero_grad()
        out = self.model((X_batch, manip))
        loss = self.loss_func(out, y_batch)
        if self.distill_cost > 1e-6:
            loss += 0.5 * self._get_distill_multiplier() * torch.mean(
                (out - out.mean(1).view(-1, 1) - soft_logits) ** 2.)
        epoch_train_loss_history.append(loss.data[0])
        epoch_train_loss *= epoch_iter / (epoch_iter + 1.)
        epoch_train_loss += loss.data[0] / (epoch_iter + 1.)
        epoch_iter += 1
        _, y_pred = torch.max(out.data, 1)
        epoch_correct += y_pred.eq(y_batch.data).cpu().sum()
        epoch_total += y_batch.size(0)
        epoch_acc = epoch_correct / float(epoch_total)
        if self.verbose:
            s = "loss: {0:.4f} acc: {1:.4f}".format(epoch_train_loss, epoch_acc)
            print_inline(s)
        loss.backward()  # create_graph=True, retain_graph=True
        self.optim.step()
    # update global history
    self.train_loss_history.append(epoch_train_loss_history)
    self.train_acc_history.append(epoch_acc)
    # update cyclic LR if enabled
    if self.cyclic_lr:
        lrm = self._get_cyclic_lrm()
        self._mul_lr_by(lrm)
def train_epoch(self, nnet):
    self._setup(nnet)
    losses = []
    for X_batch, y_batch in nnet.batch_iter():
        if self.verbose:
            print_inline('.')
        loss = np.mean(nnet.update(X_batch, y_batch))
        self.update(nnet)
        nnet._max_norm_update()
        losses.append(loss)
    if self.verbose:
        print
    return losses  # epoch losses
def train(samples, word_vectors, net, optimizer, criterion, epoch):
    # shuffle train set
    random.shuffle(samples)
    acc_loss = 0.0
    total_step = 0
    len_samples = len(samples)
    n_batches = len_samples // params.batch_size
    if (len_samples - n_batches * params.batch_size) != 0:
        n_batches += 1
    for batch_idx, i in enumerate(range(n_batches), 1):
        start = i * params.batch_size
        end = start + params.batch_size
        batch_samples = samples[start:end]
        mini_batches = sample_prediction_point(batch_samples, word_vectors)
        batch_loss = 0.0
        for inputs, targets in mini_batches:
            inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
            bs = inputs.shape[0]
            labels = torch.cat([
                torch.ones(bs, params.n_positive),
                torch.zeros(bs, params.n_negative)
            ], 1)
            if torch.cuda.is_available():
                inputs, targets, labels = inputs.cuda(), targets.cuda(), labels.cuda()
            inputs, targets, labels = Variable(inputs), Variable(targets), Variable(labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            logits = net(inputs, targets)
            loss = criterion(logits, labels)
            batch_loss += loss.item()
            loss.backward()
            # torch.nn.utils.clip_grad_norm(net.network.cnn.parameters(), 0.25)
            optimizer.step()
        acc_loss += batch_loss
        total_step += len(mini_batches)
        print_inline(
            'Train Epoch: {} [{} / {} ({:.1f}%)] Learning Rate: {} Loss: {:.6f}'.format(
                epoch,
                str(batch_idx).ljust(int(floor(log10(n_batches))), ' '),
                n_batches,
                100. * batch_idx / n_batches,
                _get_learning_rate(optimizer)[0],
                batch_loss / len(mini_batches)))
    acc_loss /= total_step
    return acc_loss
def build_batch(items, wv, min_doc_length, common_words, uncommon_words):
    batch = {}
    count = 0
    docs_sentences = get_sentences(items)
    for newsId, sentences in docs_sentences.items():
        print_inline('Pre-process items {}/{}'.format(count, len(docs_sentences)))
        count += 1
        words = [w for s in sentences for w in s.strip().split()]
        if len(words) < min_doc_length:
            continue
        words_indices = [get_word_index(wv, word, common_words, uncommon_words) for word in words]
        batch[newsId] = words_indices
    return batch
def preprocess_dataset(fields=None):
    if fields is None:
        fields = ['title_token', 'sapo_token', 'content_token', 'tag_token']
    assets_folder = 'assets'
    pathlib.Path(assets_folder).mkdir(parents=True, exist_ok=True)
    results = {}
    sentence_length_arr = []
    with open('dataset/items.txt', 'r') as fp:
        with open(assets_folder + '/items.txt', 'w') as fw:
            count = 0
            while True:
                line = fp.readline().strip()
                if not line:
                    break
                if random.random() > params.ratio:  # pick 20%
                    continue
                fw.write(line + os.linesep)
                count += 1
                print_inline(count)
                item = json.loads(line)
                for field in fields:
                    text = item.get(field)
                    if text is None:
                        continue
                    sentences = nltk.tokenize.sent_tokenize(text)
                    normalized_text = ""
                    for sentence in sentences:
                        sentence_length, sentence = normalize(sentence)
                        sentence_length_arr.append(sentence_length)
                        normalized_text += sentence + " . "
                    item[field] = normalized_text.strip(". ")
                results[item.get('newsId')] = item
    print("Average length of each sentence is: %.2f" % (sum(sentence_length_arr) / len(sentence_length_arr)))
    del sentence_length
    with open(assets_folder + '/items.pickle', 'wb') as fp:
        pickle.dump(results, fp, pickle.HIGHEST_PROTOCOL)
def iterate_over_files():
    file_counter = 0
    for root, dirs, files in os.walk(PAGES_DIR):
        for f in files:
            page_path = os.path.join(root, f)
            page_content = read_file(page_path)
            text = markup_formatter.format_text(page_content)
            text_dir = get_dir(f)
            text_path = os.path.join(text_dir, f)
            save_file(text_path, text)
            file_counter += 1
            if file_counter % 200 == 0:
                print_inline('.')
            if file_counter % 10000 == 0:
                print_inline('\n' + str(file_counter) + ' páginas')
def generate_pair(filepath, block_size, add_noise):
    base_log = 'Processing ' + filepath + ' - '
    print_inline(base_log + 'reading raw')
    raw_image = dcraw.read_raw(filepath)
    chunk_matrix = split_image_into_chunks(raw_image)
    subsampled_chunks = []
    new_original_chunks = []
    print_inline(base_log + 'subsampling ' + str(len(chunk_matrix) * len(chunk_matrix[0])) + ' chunks')
    for row_index, row in enumerate(chunk_matrix):
        subsampled_row = []
        new_original_row = []
        for index, chunk in enumerate(row):
            if add_noise:
                [noise_params] = estimator.estimate_noise(chunk)
                # Clip the noise to a reasonable interval
                noise_params = [
                    max(NOISE_A_MIN_VALUE, min(NOISE_A_MAX_VALUE, noise_params[0])),
                    max(NOISE_B_MIN_VALUE, min(NOISE_B_MAX_VALUE, noise_params[1]))
                ]
            numpy_image = np.matrix.transpose(np.array(chunk))
            subsampled_chunk = subsample_image(numpy_image, block_size)
            groundtruth_chunk = Image.fromarray(subsampled_chunk, mode='RGB')
            input_chunk = three_channel_to_bayer(groundtruth_chunk)
            if add_noise:
                numpy_input_chunk = np.matrix.transpose(np.array(input_chunk))
                noised_input_chunk = estimator.apply_noise(
                    numpy_input_chunk, noise_params[0], noise_params[1])
                input_chunk = Image.fromarray(
                    np.matrix.transpose(noised_input_chunk), mode='L')
            subsampled_row.append(groundtruth_chunk)
            new_original_row.append(input_chunk)
        subsampled_chunks.append(subsampled_row)
        new_original_chunks.append(new_original_row)
    groundtruth_image = join_chunks_into_image(subsampled_chunks)
    new_original_image = join_chunks_into_image(new_original_chunks)
    # Generated images must have even dimensions.
    width, height = groundtruth_image.size
    if width % 2 != 0 or height % 2 != 0:
        new_width = width if width % 2 == 0 else (width - 1)
        new_height = height if height % 2 == 0 else (height - 1)
        groundtruth_image = groundtruth_image.crop((0, 0, new_width, new_height))
        new_original_image = new_original_image.crop((0, 0, new_width, new_height))
    print_inline(base_log + 'creating initial image')
    return new_original_image, groundtruth_image
def print_formatted(datas):
    """Pretty print JSON data.

    Argument:
        datas: dictionary of data
    """
    if not datas:
        print("No data")
        exit(1)
    if isinstance(datas, list):
        # get all zones
        # API /zone without :identifier
        hr()
        print('%-20s %-8s %-12s' % ('name', 'type', 'notified_serial'))
        hr()
        for record in datas:
            # print 'NAME'
            utils.print_inline("%(name)-20s" % record)
            # print 'TYPE' of record
            utils.print_inline("%(type)-8s" % record)
            if record.get('notified_serial'):
                print("%(notified_serial)s" % record)
            else:
                print('')
        exit(0)
    elif datas.get('records'):
        print("domain: %(name)s" % datas)
        if datas.get('type') == 'MASTER' and datas.get('notified_serial'):
            print("serial: %(notified_serial)s" % datas)
        print("DNS : %(type)s" % datas)
        # print header
        hr()
        print('%-33s %-5s %-25s %-5s %-3s' % ('name', 'type', 'content', 'ttl', 'prio'))
        hr()
        for record in datas.get('records'):
            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)
            # print 'TYPE' of SOA record
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)
            # print 'TYPE' of non-SOA record
            else:
                utils.print_inline("%(type)-5s" % record)
            # print 'CONTENT' of SOA record
            if record.get('type') == 'SOA':
                utils.print_inline(">\t\t%(content)-25s " % record)
            # print 'CONTENT' of non-SOA record
            else:
                utils.print_inline("%(content)-25s" % record)
            # print TTL and PRIORITY for MX, SRV record
            if record.get('priority'):
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)
            # print TTL for other records
            else:
                print("%(ttl)5s " % record)
        hr()
    elif datas.get('identifier'):
        # for template
        print("identifier : %(identifier)s" % datas)
        print("description: %(description)s" % datas)
        hr()
        print('%-33s %-5s %-25s %-5s %-3s' % ('name', 'type', 'content', 'ttl', 'prio'))
        for record in datas.get('entries'):
            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)
            # print 'TYPE' for SOA
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)
            # print 'TYPE' for non-SOA
            else:
                utils.print_inline("%(type)-5s" % record)
            # print 'CONTENT' for SOA
            if record.get('type') == 'SOA':
                utils.print_inline("> %(content)-25s " % record)
            # print 'CONTENT' for non-SOA
            else:
                utils.print_inline("%(content)-24s" % record)
            # print 'TTL' and 'PRIORITY'
            if record.get('priority') is not None:
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)
            # print 'TTL' only
            else:
                print("%(ttl)5s " % record)
        hr()
    else:
        print("No match records")
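# NOTE: `print_formatted` above calls an `hr()` helper that is not shown in this
# section. A minimal sketch, assuming it simply prints a horizontal rule to separate
# the table header from the rows (the width of 78 is an assumption):
def hr(width=78):
    # print a dashed separator line
    print('-' * width)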
def main():
    start_at = time.time()
    dictionary = Word2Vec.load(params.dictionary_path)
    word_vectors = dictionary.wv
    del dictionary
    embedding = word_vectors_2_embedding(word_vectors)
    print('Finish loading Word2Vec model! Size: ({},{})'.format(
        embedding.shape[0], embedding.shape[1]))
    net = UnsupervisedCNNEmbeddingNetwork(embedding, params.num_channels,
                                          pos=params.n_positive, neg=params.n_negative)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)
    net.train()
    # optimizer = optim.SGD([{
    #     'params': net.module.network.cnn.parameters() if torch.cuda.device_count() > 1 else net.network.cnn.parameters()
    # }, {
    #     'params': net.module.network.fc.parameters() if torch.cuda.device_count() > 1 else net.network.fc.parameters(),
    #     'weight_decay': params.weight_decay
    # }], lr=params.learning_rate, momentum=params.momentum)
    optimizer = optim.Adadelta([{
        'params': net.network.cnn.parameters()
    }, {
        'params': net.network.fc.parameters(),
        'weight_decay': params.weight_decay
    }], lr=params.learning_rate, rho=0.9, eps=1e-06)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3,
                                                     patience=1, min_lr=1e-3, verbose=True)
    criterion = nn.BCEWithLogitsLoss()
    try:
        data_path = 'assets/data.pickle'
        pathlib.Path(data_path).parent.mkdir(parents=True, exist_ok=True)
        if os.path.exists(data_path):
            with open(data_path, mode='rb') as fp:
                samples = pickle.load(fp)
        else:
            items = pickle.load(open('assets/items.pickle', mode='rb'))
            common_words = pickle.load(open(params.common_words_path, 'rb'))
            uncommon_words = pickle.load(open(params.uncommon_words_path, 'rb'))
            min_doc_length = params.min_offset + params.n_positive
            samples = build_batch(items, word_vectors, min_doc_length, common_words, uncommon_words)
            samples = list(samples.values())
            del items, common_words, uncommon_words
            with open(data_path, mode='wb') as fp:
                pickle.dump(samples, fp, pickle.HIGHEST_PROTOCOL)
        print('\nNumber of samples: %d' % len(samples))
        print("Training...")
        for epoch in range(1, params.n_epochs + 1):  # loop over the dataset multiple times
            acc_loss = train(samples, word_vectors, net, optimizer, criterion, epoch)
            # print statistics
            print_inline('[{:3d}] loss: {:.5f} - learning rate: {}\n'.format(
                epoch, acc_loss, _get_learning_rate(optimizer)[0]))
            # Save the model if the validation loss is the best we've seen so far.
            if not scheduler.best or scheduler.is_better(acc_loss, scheduler.best):
                with open(params.save_path, 'wb') as f:
                    torch.save(net.network.state_dict(), f)
            scheduler.step(acc_loss)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
        print('-' * 89)
    finally:
        end_at = time.time()
        print("start at: {}\nend_at: {}\nruntime: {} min".format(
            time.ctime(start_at), time.ctime(end_at), (end_at - start_at) / 60))
        print('Finished Training\n')
def _fit(self, X):
    if not self._initialized:
        layer = FullyConnected(self.n_hidden, bias=0., random_seed=self.random_seed)
        layer.setup_weights(X.shape)
        self.W = layer.W
        self.vb = np.zeros(X.shape[1])
        self.hb = layer.b
        self._dW = np.zeros_like(self.W)
        self._dvb = np.zeros_like(self.vb)
        self._dhb = np.zeros_like(self.hb)
        self._rng = RNG(self.random_seed)
    self._rng.reseed()
    timer = Stopwatch(verbose=False).start()
    for _ in xrange(self.n_epochs):
        self.epoch += 1
        if self.verbose:
            print_inline('Epoch {0:>{1}}/{2} '.format(
                self.epoch, len(str(self.n_epochs)), self.n_epochs))
        if isinstance(self.learning_rate, str):
            S, F = map(float, self.learning_rate.split('->'))
            self._learning_rate = S + (F - S) * (
                1. - np.exp(-(self.epoch - 1.) / 8.)) / (
                1. - np.exp(-(self.n_epochs - 1.) / 8.))
        else:
            self._learning_rate = self.learning_rate
        if isinstance(self.momentum, str):
            S, F = map(float, self.momentum.split('->'))
            self._momentum = S + (F - S) * (
                1. - np.exp(-(self.epoch - 1) / 4.)) / (
                1. - np.exp(-(self.n_epochs - 1) / 4.))
        else:
            self._momentum = self.momentum
        mean_recon = self.train_epoch(X)
        if mean_recon < self.best_recon:
            self.best_recon = mean_recon
            self.best_epoch = self.epoch
            self.best_W = self.W.copy()
            self.best_vb = self.vb.copy()
            self.best_hb = self.hb.copy()
            self._early_stopping = self.early_stopping
        msg = 'elapsed: {0} sec'.format(
            width_format(timer.elapsed(), default_width=5, max_precision=2))
        msg += ' - recon. mse: {0}'.format(
            width_format(mean_recon, default_width=6, max_precision=4))
        msg += ' - best r-mse: {0}'.format(
            width_format(self.best_recon, default_width=6, max_precision=4))
        if self.early_stopping:
            msg += ' {0}*'.format(self._early_stopping)
        if self.verbose:
            print msg
        if self._early_stopping == 0:
            return
        if self.early_stopping:
            self._early_stopping -= 1
def main():
    dictionary = Word2Vec.load(params.dictionary_path)
    common_words = pickle.load(open(params.common_words_path, 'rb'))
    uncommon_words = pickle.load(open(params.uncommon_words_path, 'rb'))
    word_vectors = dictionary.wv
    del dictionary
    embeddings = word_vectors_2_embedding(word_vectors)
    embedding_table = nn.Embedding(*embeddings.shape)
    embedding_table.weight.data.copy_(torch.tensor(embeddings))
    embedding_table.weight.requires_grad = False
    print('Finish loading Word2Vec model! Size: ({},{})'.format(
        embeddings.shape[0], embeddings.shape[1]))
    embedding_size = embeddings.shape[1]
    test_words = ['đẹp', 'Ronaldo', 'Covid']
    for test_word in test_words:
        _, test_word = normalize(test_word)
        print("*" * 90)
        print("Danh sách từ khóa cùng ngữ cảnh với từ: %s" % test_word)
        test_word_embedding = word_vectors[test_word]
        scores = np.matmul(embeddings, test_word_embedding)
        print([
            word_vectors.index2word[top_idx]
            for top_idx in np.argsort(scores)[-2:-12:-1]
        ])
        print("*" * 90)
    net = GatedCNN(embedding_size, params.num_channels)
    if torch.cuda.is_available():
        net = net.cuda()
        net.load_state_dict(torch.load(params.save_path))
    else:
        net.load_state_dict(
            torch.load(params.save_path, map_location=lambda storage, loc: storage))
    net.eval()
    with open('assets/items.pickle', mode='rb') as fp:
        items = pickle.load(fp)
    if os.path.exists(params.embedding_path):
        with open(params.embedding_path, mode='rb') as fp:
            embedding_items = pickle.load(fp)
    else:
        batch = build_batch(items, word_vectors, common_words, uncommon_words)
        # save and clean
        del common_words, uncommon_words
        embedding_items = {}
        count = 0
        for newsId, index_vector in batch.items():
            count += 1
            print_inline('Calculating item embedding {}/{}'.format(count, len(items)))
            try:
                index_vector = torch.tensor(index_vector)
                inputs = embedding_table(index_vector)
                inputs = torch.FloatTensor(inputs)
                inputs = inputs.unsqueeze(0).permute(0, 2, 1)  # (batch_size, embedding_size, seq_len)
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                doc_embedding = net(inputs)[0].cpu().detach().numpy()
                doc_embedding = doc_embedding / LA.norm(doc_embedding)
                embedding_items[newsId] = doc_embedding
            except Exception as e:
                print(e)
        with open(params.embedding_path, mode='wb') as fp:
            pickle.dump(embedding_items, fp, pickle.HIGHEST_PROTOCOL)

    def item_sim(id1, id2):
        return np.dot(embedding_items.get(id1, np.zeros(embedding_size)),
                      embedding_items.get(id2, np.zeros(embedding_size))).item()

    while True:
        item_id = input("\nNhập vào ID cua bài viết: ").strip()
        if item_id == "":
            break
        if item_id not in embedding_items:
            print("ID không tồn tại")
            continue
        print("Bài đang xét: " + items[item_id]['title_token'])

        def custom_comparator(id1, id2):
            score = item_sim(item_id, id2) - item_sim(item_id, id1)
            if score > 0:
                return 1
            if score == 0:
                return 0
            return -1

        candidate_items = embedding_items.copy()
        candidate_items.pop(item_id)
        sorted_ids = sorted(candidate_items.keys(),
                            key=functools.cmp_to_key(
                                lambda id1, id2: custom_comparator(id1, id2)))
        print("Danh sách top 10 bài liên quan được gợi ý:")
        count = 0
        i = 0
        title_set = set(normalize(items[item_id]['title_token']))
        while count < 10:
            title = normalize(items[sorted_ids[i]]['title_token'])
            i += 1
            if title in title_set:
                continue
            count += 1
            title_set.add(title)
            print("{}. {}".format(count, items[sorted_ids[i]]['title_token']))
def fit(self, X, y):
    timer = Stopwatch(verbose=False).start()
    X, y = self._check_X_y(X, y)
    unique_params = self.unique_params()
    tts = TrainTestSplitter(**self.train_test_splitter_params)
    number_of_combinations = self.number_of_combinations()
    total_iter = self.n_splits * number_of_combinations
    current_iter_width = len(str(total_iter))

    if self.verbose:
        print "Training {0} on {1} samples x {2} features.".format(
            self.model.model_name(), *X.shape)
        print "{0}-fold CV for each of {1} params combinations == {2} fits ...\n"\
            .format(self.n_splits, number_of_combinations, total_iter)

    # initialize `cv_results_`
    self.cv_results_['mean_score'] = []
    self.cv_results_['std_score'] = []
    self.cv_results_['params'] = []
    for k in xrange(self.n_splits):
        self.cv_results_['split{0}_score'.format(k)] = []
        self.cv_results_['split{0}_train_time'.format(k)] = []
        self.cv_results_['split{0}_test_time'.format(k)] = []
    for param_name in unique_params:
        self.cv_results_['param_{0}'.format(param_name)] = ma.array([])

    current_iter = 0
    if self.refit:
        # for each param combination fit consequently on each fold
        # to obtain mean score across splits as soon as possible
        for params_index, params in enumerate(self.gen_params()):
            # set params and add to `cv_results_`
            self.model.reset_params().set_params(**params)
            self.cv_results_['params'].append(params)
            for param_name in unique_params:
                cv_key = 'param_{0}'.format(param_name)
                mask = [int(not param_name in params)]
                to_concat = ma.array([params.get(param_name, None)], mask=mask)
                self.cv_results_[cv_key] = ma.concatenate(
                    (self.cv_results_[cv_key], to_concat))

            splits_scores = []
            for split_index, (train, test) in enumerate(
                    tts.k_fold_split(y, n_splits=self.n_splits, stratify=True)):
                # verbosing
                if self.verbose:
                    current_iter += 1
                    t = "iter: {0:{1}}/{2} ".format(
                        current_iter, current_iter_width, total_iter)
                    t += '+' * (split_index + 1) + '-' * (self.n_splits - split_index - 1)
                    print_inline(t)
                # fit and evaluate
                with Stopwatch(verbose=False) as s:
                    self.model.fit(X[train], y[train])
                self.cv_results_['split{0}_train_time'.format(split_index)].append(s.elapsed())
                with Stopwatch(verbose=False) as s:
                    score = self.model.evaluate(X[test], y[test])
                self.cv_results_['split{0}_test_time'.format(split_index)].append(s.elapsed())
                # score = self.scoring(y[test], y_pred)
                splits_scores.append(score)
                # add score to `cv_results_`
                self.cv_results_['split{0}_score'.format(split_index)].append(score)
                # verbosing
                if self.verbose:
                    print_inline(" elapsed: {0} sec".format(
                        width_format(timer.elapsed(), default_width=7)))
                    if split_index < self.n_splits - 1:
                        t = ""
                        if self.best_score_ > -np.inf:
                            t += " - best acc.: {0:.4f} at {1}"\
                                .format(self.best_score_, self.best_params_)
                        else:
                            t += " ..."
                        print t

            # compute mean and std score
            mean_score = np.mean(splits_scores)
            std_score = np.std(splits_scores)
            self.cv_results_['mean_score'].append(mean_score)
            self.cv_results_['std_score'].append(std_score)

            # update 'best' attributes
            if mean_score > self.best_score_:
                self.best_index_ = params_index
                self.best_score_ = mean_score
                self.best_std_ = std_score
                self.best_params_ = params
                self.best_model_ = self.model
                if self.save_models:
                    self.best_model_.save(filepath=os.path.join(self.dirpath, self._best_model_name()),
                                          **self.save_params)
            # verbosing
            if self.verbose:
                print_inline(" - mean acc.: {0:.4f} +/- 2 * {1:.3f}\n".format(mean_score, std_score))

    else:  # if self.refit == False
        # fit for each fold and then evaluate on each combination of params
        for split_index, (train, test) in enumerate(
                tts.k_fold_split(y, n_splits=self.n_splits, stratify=True)):
            current_best_score = -np.inf
            current_best_params = None
            for params_index, params in enumerate(self.gen_params()):
                # set params
                self.model.reset_params().set_params(**params)
                # fit model (only once per split)
                if params_index == 0:
                    with Stopwatch(verbose=False) as s:
                        self.model.fit(X[train], y[train])
                # on first split add params to `cv_results_`
                if split_index == 0:
                    # store params' values
                    self.cv_results_['params'].append(params)
                    for param_name in unique_params:
                        cv_key = 'param_{0}'.format(param_name)
                        mask = [int(not param_name in params)]
                        to_concat = ma.array([params.get(param_name, None)], mask=mask)
                        self.cv_results_[cv_key] = ma.concatenate(
                            (self.cv_results_[cv_key], to_concat))
                # write training time
                self.cv_results_['split{0}_train_time'.format(split_index)]\
                    .append(s.elapsed() if params_index == 0 else 0.)
                # evaluate
                with Stopwatch(verbose=False) as s:
                    score = self.model.evaluate(X[test], y[test])
                self.cv_results_['split{0}_test_time'.format(split_index)].append(s.elapsed())
                # score = self.scoring(y[test], y_pred)
                # add score to `cv_results_`
                cv_key = 'split{0}_score'.format(split_index)
                self.cv_results_[cv_key].append(score)
                # update "current" best score and params
                current_mean_score = np.mean([
                    self.cv_results_['split{0}_score'.format(k)][params_index]
                    for k in xrange(split_index + 1)
                ])
                if current_mean_score > current_best_score:
                    current_best_score = current_mean_score
                    current_best_params = params
                # verbosing
                if self.verbose:
                    current_iter += 1
                    t = "iter: {0:{1}}/{2} ".format(
                        current_iter, current_iter_width, total_iter)
                    t += '+' * (split_index + 1) + '-' * (self.n_splits - split_index - 1)
                    t += " elapsed: {0} sec".format(
                        width_format(timer.elapsed(), default_width=7))
                    if split_index < self.n_splits - 1:
                        t += " - best acc.: {0:.4f} [{1}/{2} splits] at {3}"\
                            .format(current_best_score, split_index + 1, self.n_splits, current_best_params)
                    print_inline(t)
                    if split_index < self.n_splits - 1:
                        print
                # after last split ...
                if split_index == self.n_splits - 1:
                    # ... compute means, stds
                    splits_scores = [
                        self.cv_results_['split{0}_score'.format(k)][params_index]
                        for k in xrange(self.n_splits)
                    ]
                    mean_score = np.mean(splits_scores)
                    std_score = np.std(splits_scores)
                    self.cv_results_['mean_score'].append(mean_score)
                    self.cv_results_['std_score'].append(std_score)
                    # ... and update best attributes
                    if mean_score > self.best_score_:
                        self.best_index_ = params_index
                        self.best_score_ = mean_score
                        self.best_std_ = std_score
                        self.best_params_ = params
                        self.best_model_ = self.model
                        if self.save_models:
                            self.best_model_.save(filepath=os.path.join(self.dirpath, self._best_model_name()),
                                                  **self.save_params)
                    # verbosing
                    if self.verbose:
                        print_inline(" - best acc.: {0:.4f} +/- 2 * {1:.3f} at {2}\n"
                                     .format(self.best_score_, self.best_std_, self.best_params_))

    # convert lists to np.ndarray
    for key in (['mean_score', 'std_score', 'params'] +
                ['split{0}_{1}'.format(k, s) for k in xrange(self.n_splits)
                 for s in ('score', 'train_time', 'test_time')]):
        self.cv_results_[key] = np.asarray(self.cv_results_[key])
    return self