def train_we(self, pos_src, neg_src, save_path):
    from utils.io import ensure_dir
    # make sure the directory that will hold the saved model exists
    ensure_dir(os.path.dirname(save_path))

    logger.info("building corpus")
    train_corpus = []
    i = 0  # tag number
    for fname in [pos_src, neg_src]:
        with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
            # for training data, add tags
            logger.info("processing " + fname)
            for line in f:
                train_corpus.append(
                    gensim.models.doc2vec.TaggedDocument(
                        gensim.utils.simple_preprocess(line), [i]))
                i += 1
                if i % 10000 == 0:
                    logger.info("added %d lines" % i)
    logger.info("built train_corpus of length %d" % len(train_corpus))

    epochs = 20  # typically 10~20
    model = gensim.models.doc2vec.Doc2Vec(train_corpus,
                                          vector_size=300,
                                          min_count=2,
                                          epochs=epochs)
    model.save(save_path)
    return model
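# A minimal usage sketch for the Doc2Vec variant of train_we above. The corpus
# and model paths (and the `trainer` object owning the method) are placeholders;
# simple_preprocess and infer_vector are standard gensim API.
model = trainer.train_we("data/train_pos.txt", "data/train_neg.txt",
                         "output/doc2vec.model")  # assumed paths
tokens = gensim.utils.simple_preprocess("just watched a great movie")
vector = model.infer_vector(tokens)  # 300-dimensional document vector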
def __init__(self, model, output, name, no_cuda):
    super().__init__()
    self.model = model
    self.training_name = name
    self.output_name = name
    self.save_dir = io.abs_path(output)
    self.with_cuda = not no_cuda
    self.single_gpu = True

    path = os.path.join(self.save_dir, self.training_name, 'log_test')
    self.model_logger = ModelLogger(path, self.training_name)
    path = os.path.join(self.save_dir, self.training_name, 'log_results_test')
    self.train_logger = TrainingLogger(path)

    if not torch.cuda.is_available():
        self.with_cuda = False
        self._logger.info("GPU not available, setting no_cuda = True")
    if self.with_cuda:
        self._logger.info("GPU is used!")
        self.model = self.model.cuda()
        if torch.cuda.device_count() > 1:
            if self.verbosity:
                self._logger.info("Let's use %d GPUs!",
                                  torch.cuda.device_count())
            # parallelise across GPUs (this might be slower)
            self.single_gpu = False
            self.model.parallelise()
            self.model = torch.nn.DataParallel(self.model)

    io.ensure_dir(os.path.join(self.save_dir, self.training_name))
def save_batch_images(self, name, image, idx, image_pred=None,
                      image_target=None, pose_pred=None, pose_gt=None):
    name = name + "_images"
    self.record_index(name, idx)
    dir_path = os.path.join(self.path, name)
    ensure_dir(dir_path)

    # input image
    path = os.path.join(dir_path, '%s.npy' % idx)
    image = image_pytorch_to_numpy(image, True)
    np.save(path, image)

    # target image
    if image_target is not None:
        path = os.path.join(dir_path, '%sT.npy' % idx)
        image_target = image_pytorch_to_numpy(image_target, True)
        np.save(path, image_target)

    # predicted image
    if image_pred is not None:
        path = os.path.join(dir_path, '%sT_gt.npy' % idx)
        image_pred = image_pytorch_to_numpy(image_pred, True)
        np.save(path, image_pred)

    # predicted pose
    if pose_pred is not None:
        path = os.path.join(dir_path, '%spose.npy' % idx)
        pose_pred = tensor_to_numpy(pose_pred)
        np.save(path, pose_pred)

    # ground-truth pose
    if pose_gt is not None:
        path = os.path.join(dir_path, '%spose_gt.npy' % idx)
        pose_gt = tensor_to_numpy(pose_gt)
        np.save(path, pose_gt)
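# A minimal sketch of reading back the arrays written by save_batch_images.
# The logger directory, batch name, and index below are placeholders.
import os
import numpy as np

dir_path = os.path.join("output/run1/log_results_test",
                        "training_logger", "val_images")      # assumed path
idx = 0
image = np.load(os.path.join(dir_path, '%s.npy' % idx))          # input batch
pose_pred = np.load(os.path.join(dir_path, '%spose.npy' % idx))  # predicted pose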
def __init__(self, dir_path, training_name):
    self.dir_path = dir_path
    ensure_dir(self.dir_path)
    self.training_name = training_name
    self.train = SummaryWriter(os.path.join(self.dir_path, 'train'),
                               comment=self.training_name)
    self.val = SummaryWriter(os.path.join(self.dir_path, 'val'),
                             comment=self.training_name)
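# A minimal usage sketch for ModelLogger, which exposes one TensorBoard writer
# per split; add_scalar is standard SummaryWriter API, and the path and values
# below are placeholders.
model_logger = ModelLogger("output/run1/log", "run1")
model_logger.train.add_scalar("loss", 0.42, global_step=100)  # training curve
model_logger.val.add_scalar("loss", 0.55, global_step=100)    # validation curve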
def __init__(self, dir_path):
    self.dir_path = dir_path
    ensure_dir(self.dir_path)
    self._logger = ConsoleLogger()
    self.path = os.path.join(self.dir_path, 'training_logger/')
    ensure_dir(self.path)
    self.scalars = {}
    self.scalars_saved = 0
def __init__(self, model, optimizer, no_cuda, eval_epoch, epochs,
             name, output, save_freq, verbosity, train_log_step,
             verbosity_iter, reset=False, **kwargs):
    """Init class"""
    super().__init__()
    self.model = model
    self.optimizer = optimizer
    self.epochs = epochs
    self.training_name = name
    self.save_dir = output
    self.save_freq = save_freq
    self.with_cuda = not no_cuda
    self.verbosity = verbosity
    self.verbosity_iter = verbosity_iter
    self.train_log_step = train_log_step
    self.eval_epoch = eval_epoch
    self.min_loss = math.inf
    self.start_epoch = 0
    self.start_iteration = 0

    path = os.path.join(self.save_dir, self.training_name, 'log')
    self.model_logger = ModelLogger(path, self.training_name)
    path = os.path.join(self.save_dir, self.training_name, 'log_results')
    self.train_logger = TrainingLogger(path)

    self.training_info = None
    self.reset = reset
    self.single_gpu = True
    self.global_step = 0

    # check that we can run on GPU
    if not torch.cuda.is_available():
        self.with_cuda = False
        self._logger.info("GPU not available, setting no_cuda = True")
    if self.with_cuda:
        self._logger.info("GPU is used!")
        if torch.cuda.device_count() > 1:
            if self.verbosity:
                self._logger.info("Let's use %d GPUs!",
                                  torch.cuda.device_count())
            # parallelise across GPUs (this might be slower)
            self.single_gpu = False
            self.model.parallelise()
            self.model = torch.nn.DataParallel(self.model)

    io.ensure_dir(os.path.join(self.save_dir, self.training_name))
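# A minimal sketch of constructing the trainer above. The network, optimizer,
# and the `Trainer` class name are placeholders standing in for the repo's own
# classes; the keyword arguments mirror the signature of __init__.
net = MyModel()  # hypothetical torch.nn.Module
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
trainer = Trainer(model=net, optimizer=opt, no_cuda=False, eval_epoch=1,
                  epochs=50, name="run1", output="output", save_freq=1000,
                  verbosity=1, train_log_step=100, verbosity_iter=500)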
def evaluate_for_kaggle(self, output_src):
    import logging
    logger = logging.getLogger(self.__class__.__name__)

    logger.info("Predicting on test set...")
    predictions = self.predict(self.data_source.testX)

    logger.info("Writing to submission file...")
    from utils.io import ensure_dir
    # make sure the directory holding the submission file exists
    ensure_dir(os.path.dirname(output_src))
    with open(output_src, "w") as f:
        f.write("Id,Prediction\n")
        for idx, val in enumerate(predictions):
            f.write("%d,%d\n" % (idx + 1, val))
    logger.info("Submission file saved: " + output_src)
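# A minimal usage sketch for evaluate_for_kaggle; the classifier object and the
# submission path are placeholders.
classifier.evaluate_for_kaggle("output/submission.csv")  # writes Id,Prediction rows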
def save_dic(self, name, dic_in, idx, extra_str="_", gpu=True):
    dic = {}
    if extra_str == "_":
        self.record_index(name, idx)
    if gpu:
        # move tensors to CPU numpy arrays before pickling
        for key in dic_in.keys():
            if type(dic_in[key]) == tuple:
                dic[key] = (dic_in[key][0].cpu().data.numpy(),
                            dic_in[key][1].cpu().data.numpy())
            else:
                dic[key] = dic_in[key].cpu().data.numpy()
    else:
        dic = dic_in

    dir_path = os.path.join(self.path, name)
    ensure_dir(dir_path)
    string = 'dic_%s' % idx
    string = extra_str + string
    path_in = os.path.join(dir_path, string)
    with open(path_in, "wb") as f:
        pkl.dump(dic, f)
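# A minimal sketch of loading a dictionary written by save_dic. The logger
# path, name, and index are placeholders.
import os
import pickle as pkl

path_in = os.path.join("output/run1/log_results",
                       "training_logger", "poses", "_dic_0")  # assumed path
with open(path_in, "rb") as f:
    dic = pkl.load(f)  # keys map to numpy arrays (or tuples of arrays)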
def load_config() -> dict:
    """Load configuration file

    Returns:
        dict: dictionary with project configuration information
    """
    # path with project configs
    config_path = join(ROOT_DIR, 'config/general.yml')
    if not exists(config_path):
        raise Exception('File {} does not exist!'.format(config_path))

    with open(config_path) as fin:
        config_data = edict(yaml.safe_load(fin))

    # fix paths w.r.t. project root dir path
    for key, val in config_data.dirs.items():
        config_data.dirs[key] = ensure_dir(abspath(join(ROOT_DIR, val)))
    config_data.dirs.root = abspath(ROOT_DIR)

    return config_data
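# A minimal usage sketch for load_config; the 'data' key is a placeholder and
# depends on what config/general.yml actually defines.
config = load_config()
print(config.dirs.root)         # absolute project root
print(config.dirs.get('data'))  # assumed directory entry, resolved and created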
def train_we(self, pos_src, neg_src, save_path):
    import subprocess
    from utils.io import ensure_dir

    # make sure the output directory exists and derive working paths from it
    output_dir = os.path.dirname(save_path)
    ensure_dir(output_dir)

    # build the raw vocabulary with word counts
    vocab_path = os.path.join(output_dir, "vocab.txt")
    cmd = (r'cat %s %s | sed "s/ /\n/g" | grep -v "^\s*$" | sort | uniq -c > %s'
           % (pos_src, neg_src, vocab_path))
    logger.info("building vocab.txt: " + cmd)
    subprocess.call(["bash", "-c", cmd])

    # keep only words occurring at least 5 times
    vocab_cut_path = os.path.join(output_dir, "vocab_cut.txt")
    cmd = (r'cat %s | sed "s/^\s\+//g" | sort -rn | grep -v "^[1234]\s" | cut -d" " -f2 > %s'
           % (vocab_path, vocab_cut_path))
    logger.info("building vocab_cut.txt: " + cmd)
    subprocess.call(["bash", "-c", cmd])

    # map words to indices
    vocab = dict()
    with open(vocab_cut_path) as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx
    vocab_size = len(vocab)

    # build the word-word co-occurrence matrix
    data, row, col = [], [], []
    counter = 1
    for fn in [pos_src, neg_src]:
        with open(fn) as f:
            for line in f:
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                tokens = [t for t in tokens if t >= 0]
                for t in tokens:
                    for t2 in tokens:
                        data.append(1)
                        row.append(t)
                        col.append(t2)
                if counter % 10000 == 0:
                    logger.info("read %d samples" % counter)
                counter += 1
    cooc = coo_matrix((data, (row, col)))
    logger.info("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    logger.info("{} nonzero entries".format(cooc.nnz))

    nmax = 100
    logger.info("using nmax = %d, cooc.max() = %s" % (nmax, str(cooc.max())))

    logger.info("initializing embeddings")
    logger.info("cooc shape 0: %s, cooc shape 1: %s"
                % (str(cooc.shape[0]), str(cooc.shape[1])))
    embedding_dim = 300
    xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
    ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

    # GloVe-style SGD on the weighted least-squares objective
    eta = 0.001
    alpha = 3 / 4
    epochs = 5
    for epoch in range(epochs):
        logger.info("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            logn = np.log(n)
            fn = min(1.0, (n / nmax) ** alpha)
            x, y = xs[ix, :], ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            xs[ix, :] += scale * y
            ys[jy, :] += scale * x
    we = xs + ys

    # save in word2vec text format so it can be reloaded with KeyedVectors
    with open(save_path, "w") as f, open(vocab_cut_path, "r") as g:
        f.write(str(cooc.shape[0]) + " " + str(embedding_dim) + "\n")
        for i, word in enumerate(g):
            coords = ' '.join([str(b) for b in we[i].tolist()])
            f.write(word.strip() + " " + coords + "\n")

    model = KeyedVectors.load_word2vec_format(save_path)
    return model
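# A minimal sketch of reusing the embeddings written by the GloVe-style
# train_we above. The path is a placeholder; load_word2vec_format and
# most_similar are standard gensim KeyedVectors API.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("output/glove_embeddings.txt")  # assumed path
print(kv["good"].shape)                  # (300,) word vector
print(kv.most_similar("good", topn=5))   # nearest neighbours in embedding space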