예제 #1
0
    def train_we(self, pos_src, neg_src, save_path):
        """Train a Doc2Vec model on the lines of two text files and save it.

        Each line of ``pos_src``, followed by each line of ``neg_src``, is
        tokenised with ``gensim.utils.simple_preprocess`` and wrapped in a
        ``TaggedDocument`` whose only tag is the 0-based running line number
        across both files.

        Args:
            pos_src: path of the first corpus file (decoded as iso-8859-1).
            neg_src: path of the second corpus file (decoded as iso-8859-1).
            save_path: destination passed to ``model.save``.

        Returns:
            The trained ``gensim.models.doc2vec.Doc2Vec`` model.
        """
        from utils.io import ensure_dir
        # NOTE(review): ``output_dir`` is not defined inside this method; it
        # is presumably a module-level name -- confirm.
        ensure_dir(output_dir)
        logger.info("building corpus")

        tagged_docs = []
        for source in (pos_src, neg_src):
            with smart_open.smart_open(source,
                                       encoding="iso-8859-1") as handle:
                logger.info("processing " + source)
                for text in handle:
                    # The tag is the number of documents collected so far.
                    tag = len(tagged_docs)
                    words = gensim.utils.simple_preprocess(text)
                    tagged_docs.append(
                        gensim.models.doc2vec.TaggedDocument(words, [tag]))
                    collected = tag + 1
                    if collected % 10000 == 0:
                        logger.info("added %d lines" % collected)

        logger.info("built train_corpus of length %d" % len(tagged_docs))

        # 20 passes over the corpus (typically 10~20).
        model = gensim.models.doc2vec.Doc2Vec(tagged_docs,
                                              vector_size=300,
                                              min_count=2,
                                              epochs=20)
        model.save(save_path)
        return model
    def __init__(self, model, output, name, no_cuda):
        """Set up the model, loggers and device placement for a test run.

        Args:
            model: network to evaluate; moved to the GPU when CUDA is used.
            output: base output directory; stored as ``io.abs_path(output)``.
            name: run name, used as sub-directory of ``output`` and stored
                as both ``training_name`` and ``output_name``.
            no_cuda: when true, the GPU is not used even if one is available.
        """

        super().__init__()

        self.model = model
        self.training_name = name
        self.output_name = name
        # The original assigned ``output`` first and then overwrote it; only
        # the final value is kept.
        self.save_dir = io.abs_path(output)
        self.with_cuda = not no_cuda
        self.single_gpu = True

        run_dir = os.path.join(self.save_dir, self.training_name)
        self.model_logger = ModelLogger(os.path.join(run_dir, 'log_test'),
                                        self.training_name)
        self.train_logger = TrainingLogger(
            os.path.join(run_dir, 'log_results_test'))

        # NOTE(review): ``self._logger`` is not created here; presumably the
        # base class provides it -- confirm.
        if not torch.cuda.is_available():
            self.with_cuda = False
            self._logger.info("GPU not available set no cuda = True")
        if self.with_cuda:
            self._logger.info("GPU is used!")
            self.model = self.model.cuda()
            gpu_count = torch.cuda.device_count()
            if gpu_count > 1:
                # ``verbosity`` is never assigned by this constructor, so a
                # plain attribute read could raise AttributeError unless the
                # base class sets it; fall back to logging in that case.
                if getattr(self, 'verbosity', True):
                    self._logger.info("Let's use %d GPUs!", gpu_count)

                # Spread the model over all visible GPUs (might be slower).
                self.single_gpu = False
                self.model.parallelise()
                self.model = torch.nn.DataParallel(self.model)

        io.ensure_dir(run_dir)
예제 #3
0
    def save_batch_images(self,
                          name,
                          image,
                          idx,
                          image_pred=None,
                          image_target=None,
                          pose_pred=None,
                          pose_gt=None):
        """Dump a batch's image and optional extras as ``.npy`` files.

        Files are written into ``<self.path>/<name>_images/`` and are all
        prefixed with ``idx``:

        * ``<idx>.npy``         -- ``image`` (always written)
        * ``<idx>T.npy``        -- ``image_target``
        * ``<idx>T_gt.npy``     -- ``image_pred``
        * ``<idx>pose.npy``     -- ``pose_pred``
        * ``<idx>pose_gt.npy``  -- ``pose_gt``

        Optional arguments left as ``None`` are skipped.

        NOTE(review): the prediction is stored under the ``T_gt`` name and
        the target under ``T``; this looks swapped but is kept as-is because
        readers of these files are not visible here -- confirm.
        """
        name = name + "_images"
        self.record_index(name, idx)
        dir_path = os.path.join(self.path, name)
        ensure_dir(dir_path)

        def store_image(template, tensor):
            # Image tensors go through the image-specific converter.
            target = os.path.join(dir_path, template % idx)
            np.save(target, image_pytorch_to_numpy(tensor, True))

        def store_pose(template, tensor):
            # Pose tensors use the generic tensor converter.
            target = os.path.join(dir_path, template % idx)
            np.save(target, tensor_to_numpy(tensor))

        store_image('%s.npy', image)
        if image_target is not None:
            store_image('%sT.npy', image_target)
        if image_pred is not None:
            store_image('%sT_gt.npy', image_pred)
        if pose_pred is not None:
            store_pose('%spose.npy', pose_pred)
        if pose_gt is not None:
            store_pose('%spose_gt.npy', pose_gt)
예제 #4
0
    def __init__(self, dir_path, training_name):
        """Create the log directory and one ``SummaryWriter`` per split.

        Args:
            dir_path: directory that will hold the ``train`` and ``val``
                sub-directories; created through ``ensure_dir``.
            training_name: passed to both writers as their ``comment``.
        """
        self.dir_path = dir_path
        ensure_dir(self.dir_path)
        self.training_name = training_name

        # ``self.train`` and ``self.val`` are built the same way, differing
        # only in the sub-directory they write to.
        for split in ('train', 'val'):
            writer = SummaryWriter(os.path.join(self.dir_path, split),
                                   comment=self.training_name)
            setattr(self, split, writer)
예제 #5
0
 def __init__(self, dir_path):
     """Prepare the on-disk location and in-memory state of the logger.

     Args:
         dir_path: base directory; it and its ``training_logger/``
             sub-directory are both passed to ``ensure_dir``.
     """
     self.dir_path = dir_path
     ensure_dir(dir_path)
     self._logger = ConsoleLogger()

     # Everything this logger writes goes below ``training_logger/``.
     self.path = os.path.join(dir_path, 'training_logger/')
     ensure_dir(self.path)

     # Scalar store starts empty, with nothing saved yet.
     self.scalars, self.scalars_saved = {}, 0
    def __init__(self, model, optimizer, no_cuda, eval_epoch, epochs,
                 name, output, save_freq, verbosity,
                 train_log_step, verbosity_iter, reset=False, **kwargs):
        """Store the training configuration, build loggers, pick the device.

        Args:
            model: network to train; wrapped in ``torch.nn.DataParallel``
                when CUDA is used and more than one GPU is visible.
            optimizer: stored as ``self.optimizer``.
            no_cuda: when true, the GPU is not used even if available.
            eval_epoch: stored as ``self.eval_epoch``.
            epochs: stored as ``self.epochs``.
            name: run name, used as sub-directory of ``output``.
            output: base output directory.
            save_freq: stored as ``self.save_freq``.
            verbosity: truthy to log the multi-GPU message.
            train_log_step: stored as ``self.train_log_step``.
            verbosity_iter: stored as ``self.verbosity_iter``.
            reset: stored as ``self.reset``.
            **kwargs: accepted and ignored.
        """

        super().__init__()

        # Objects being driven.
        self.model = model
        self.optimizer = optimizer

        # Run identity and output location.
        self.training_name = name
        self.save_dir = output

        # Schedule and reporting settings.
        self.epochs = epochs
        self.eval_epoch = eval_epoch
        self.save_freq = save_freq
        self.train_log_step = train_log_step
        self.verbosity = verbosity
        self.verbosity_iter = verbosity_iter
        self.reset = reset

        # Progress bookkeeping, all starting from scratch.
        self.min_loss = math.inf
        self.start_epoch = 0
        self.start_iteration = 0
        self.global_step = 0
        self.training_info = None

        # Device flags; may be revised below.
        self.with_cuda = not no_cuda
        self.single_gpu = True

        run_dir = os.path.join(self.save_dir, self.training_name)
        self.model_logger = ModelLogger(os.path.join(run_dir, 'log'),
                                        self.training_name)
        self.train_logger = TrainingLogger(
            os.path.join(run_dir, 'log_results'))

        # Fall back to CPU when no GPU can be used.
        # NOTE(review): ``self._logger`` is not created here; presumably the
        # base class provides it -- confirm.
        if not torch.cuda.is_available():
            self.with_cuda = False
            self._logger.info("GPU not available set no cuda = True")

        if self.with_cuda:
            self._logger.info("GPU is used!")
            gpu_count = torch.cuda.device_count()
            if gpu_count > 1:
                if self.verbosity:
                    self._logger.info("Let's use %d GPUs!", gpu_count)

                # Spread the model over all visible GPUs (might be slower).
                self.single_gpu = False
                self.model.parallelise()
                self.model = torch.nn.DataParallel(self.model)

        io.ensure_dir(run_dir)
    def evaluate_for_kaggle(self, output_src):
        """Predict on the test set and write a two-column submission CSV.

        The file starts with the header ``Id,Prediction`` followed by one
        ``<id>,<prediction>`` row per prediction, with ids counting from 1.

        Args:
            output_src: path of the submission file to write.
        """
        import logging
        log = logging.getLogger(self.__class__.__name__)

        log.info("Predicting on test set...")
        predictions = self.predict(self.data_source.testX)

        log.info("Writing to submission file...")
        from utils.io import ensure_dir
        ensure_dir(output_src)
        with open(output_src, "w") as out:
            out.write("Id,Prediction\n")
            out.writelines(
                "%d,%d\n" % (row_id, label)
                for row_id, label in enumerate(predictions, start=1))
        log.info("Submission file saved: " + output_src)
예제 #8
0
 def save_dic(self, name, dic_in, idx, extra_str="_", gpu=True):
     """Pickle a dictionary of values into ``<self.path>/<name>/``.

     The file is named ``<extra_str>dic_<idx>``.

     Args:
         name: sub-directory of ``self.path`` to write into.
         dic_in: mapping to save. With ``gpu=True`` each value must be a
             tensor-like object (or a tuple whose first two items are)
             supporting ``.cpu().data.numpy()``.
         idx: index used in the file name.
         extra_str: file-name prefix; the index is recorded through
             ``self.record_index`` only when this is the default ``"_"``.
         gpu: when true, values are converted to NumPy arrays first;
             when false they are pickled unchanged.
     """
     if extra_str == "_":
         self.record_index(name, idx)

     if gpu:
         dic = {}
         for key, value in dic_in.items():
             if isinstance(value, tuple):
                 dic[key] = (value[0].cpu().data.numpy(),
                             value[1].cpu().data.numpy())
             else:
                 dic[key] = value.cpu().data.numpy()
     else:
         # Previously nothing was copied on this path, so an empty dict
         # was written and the caller's data was silently dropped.
         dic = dict(dic_in)

     dir_path = os.path.join(self.path, name)
     ensure_dir(dir_path)
     path_in = os.path.join(dir_path, extra_str + ('dic_%s' % idx))
     # Close the file deterministically instead of leaking the handle.
     with open(path_in, "wb") as handle:
         pkl.dump(dic, handle)
예제 #9
0
def load_config() -> dict:
    """Load the project configuration from ``config/general.yml``.

    Every entry under ``dirs`` is resolved against ``ROOT_DIR``, made
    absolute and passed through ``ensure_dir``, whose return value is
    stored back. ``dirs.root`` is then set to the absolute project root.

    Returns:
        dict: dictionary with project configuration information

    Raises:
        Exception: if the configuration file does not exist.
    """
    config_path = join(ROOT_DIR, 'config/general.yml')
    if not exists(config_path):
        raise Exception('File {} does not exist!'.format(config_path))

    with open(config_path) as fin:
        raw_config = yaml.safe_load(fin)
    config_data = edict(raw_config)

    # Rewrite each configured directory relative to the project root.
    dirs = config_data.dirs
    for key in list(dirs):
        resolved = abspath(join(ROOT_DIR, dirs[key]))
        dirs[key] = ensure_dir(resolved)

    dirs.root = abspath(ROOT_DIR)

    return config_data
예제 #10
0
    def train_we(self, pos_src, neg_src, save_path):
        """Train GloVe-like word embeddings from two tokenised text files.

        Steps:
          1. Build ``vocab.txt`` (token counts) and ``vocab_cut.txt``
             (tokens with a count of at least 5, most frequent first) with
             a bash pipeline.
          2. Count, for every line, all ordered pairs of in-vocabulary
             tokens into a sparse co-occurrence matrix.
          3. Fit two 300-dimensional embedding tables with SGD on a
             weighted squared error between ``dot(x, y)`` and the log
             count (no bias terms).
          4. Write the summed tables in word2vec text format to
             ``save_path`` and load them back with ``KeyedVectors``.

        Args:
            pos_src: path of the first whitespace-tokenised text file.
            neg_src: path of the second whitespace-tokenised text file.
            save_path: destination of the word2vec-format text file.

        Returns:
            The ``KeyedVectors`` loaded from ``save_path``.
        """
        import shlex
        import subprocess
        from utils.io import ensure_dir
        # NOTE(review): ``output_dir`` is not defined inside this method; it
        # is presumably a module-level string ending in a path separator
        # (it is concatenated with file names below) -- confirm.
        ensure_dir(output_dir)

        # Paths are quoted before being interpolated into the shell
        # pipelines so that spaces or metacharacters cannot break them.
        # For ordinary paths the quoted form is the path itself.
        vocab_path = output_dir + "vocab.txt"
        vocab_cmd = (
            r'cat %s %s | sed "s/ /\n/g" | grep -v "^\s*$" | sort | uniq -c > %s'
            % (shlex.quote(str(pos_src)), shlex.quote(str(neg_src)),
               shlex.quote(vocab_path)))
        logger.info("building vocab.txt: " + vocab_cmd)
        subprocess.call(["bash", "-c", vocab_cmd])

        # Drop tokens whose count is 1-4 and keep only the token column.
        vocab_cut_path = output_dir + "vocab_cut.txt"
        vocab_cut_cmd = (
            r'cat %s | sed "s/^\s\+//g" | sort -rn | grep -v "^[1234]\s" | cut -d" " -f2 > %s'
            % (shlex.quote(vocab_path), shlex.quote(vocab_cut_path)))
        logger.info("building vocab_cut.txt: " + vocab_cut_cmd)
        subprocess.call(["bash", "-c", vocab_cut_cmd])

        # Token -> row index, in the order of vocab_cut.txt.
        with open(vocab_cut_path) as f:
            vocab = {line.strip(): idx for idx, line in enumerate(f)}
        vocab_size = len(vocab)

        data, row, col = [], [], []
        samples_read = 0
        for src in [pos_src, neg_src]:
            with open(src) as f:
                for line in f:
                    tokens = [vocab.get(t, -1) for t in line.strip().split()]
                    tokens = [t for t in tokens if t >= 0]
                    # One entry per ordered pair (t, t2) of tokens on the
                    # line, including each token paired with itself.
                    n_tokens = len(tokens)
                    for t in tokens:
                        row.extend([t] * n_tokens)
                        col.extend(tokens)
                    data.extend([1] * (n_tokens * n_tokens))

                    samples_read += 1
                    if samples_read % 10000 == 0:
                        logger.info("read %d samples" % samples_read)

        # Give the shape explicitly: when inferred from the largest index
        # seen, the matrix can be smaller than the vocabulary (a vocabulary
        # entry that never matches a split() token), which would make the
        # embedding tables too short for the write-out loop below.
        cooc = coo_matrix((data, (row, col)), shape=(vocab_size, vocab_size))
        logger.info("summing duplicates (this can take a while)")
        cooc.sum_duplicates()
        logger.info("{} nonzero entries".format(cooc.nnz))

        nmax = 100
        logger.info("using nmax = %d, cooc.max() = %s" %
                    (nmax, str(cooc.max())))

        logger.info("initializing embeddings")
        logger.info("cooc shape 0: %s, cooc shape 1: %s" %
                    (str(cooc.shape[0]), str(cooc.shape[1])))
        embedding_dim = 300
        xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
        ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

        eta = 0.001  # learning rate
        alpha = 3 / 4  # exponent of the count weighting

        epochs = 5

        for epoch in range(epochs):
            logger.info("epoch {}".format(epoch))
            for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
                logn = np.log(n)
                # Weight grows with the count and is capped at 1.
                weight = min(1.0, (n / nmax)**alpha)
                # x and y are views into xs / ys, so the update of ys below
                # sees the row of xs that was just updated in place.
                x, y = xs[ix, :], ys[jy, :]
                scale = 2 * eta * weight * (logn - np.dot(x, y))
                xs[ix, :] += scale * y
                ys[jy, :] += scale * x

        # Final embedding of a word is the sum of its two vectors.
        we = xs + ys
        with open(save_path, "w") as f, open(vocab_cut_path, "r") as g:
            # word2vec text header: "<number of words> <dimension>".
            f.write(str(cooc.shape[0]) + " " + str(embedding_dim) + "\n")
            for i, word in enumerate(g):
                coords = ' '.join(map(str, we[i].tolist()))
                f.write(word.strip() + " " + coords + "\n")

        model = KeyedVectors.load_word2vec_format(save_path)

        return model