Example #1
def copy_vectorizer(dataset_name: str, dest_model_name: str) -> bool:
    """Copies the vectorizer from a given dataset to destination model

    Parameters
    ----------
    dataset_name : str
        Qualified dataset name
    dest_model_name : str
        Qualified model name

    Returns
    -------
    bool
        True if the copy succeeded, False if the dataset doesn't have a vectorizer or its vectorizer is unavailable

    """
    config = load_config()
    data_home = config["model"]["data_home"]
    model_home = config["model"]["model_home"]

    src = f"{data_home}/{dataset_name}/vectorizer.pkl"
    dest = f"{model_home}\\{dest_model_name}\\vectorizer.pkl"

    try:
        shutil.copyfile(src, dest)
        return True
    except FileNotFoundError:
        print(f"Dataset {dataset_name} has no defined vectorizer. It will not be usable for searching")
        return False
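Every snippet on this page resolves its paths through load_config()["model"]["data_home"] and load_config()["model"]["model_home"], but the loader itself is not shown here. Below is a minimal sketch of what it might look like; the config.json location and exact layout are assumptions, not the project's actual code:

import json
from pathlib import Path

# Hypothetical location of the configuration file; the real project may store it elsewhere
CONFIG_PATH = Path.home() / ".vdsh" / "config.json"

def load_config() -> dict:
    """Reads config.json into a nested dict, e.g.
    {"model": {"data_home": "/path/to/datasets", "model_home": "/path/to/models"}}"""
    with open(CONFIG_PATH, "r", encoding="utf-8") as fp:
        return json.load(fp)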
Example #2
    def run(self):
        self.status.emit(f"Loading model '{self.model_name}'")

        # Load model and vectorizer
        model, vectorizer = vdsh.utility.load_model(self.model_name)

        # Reset the model before refitting if it has already been fit
        if model.meta.is_fit:
            model_home = load_config()["model"]["model_home"]
            source = f"{model_home}/{self.model_name}"
            dest = f"{model_home}/{self.model_name}__swap"
            os.rename(source, dest)
            os.mkdir(source)

            model.meta.info["fit"] = False
            model.meta.info["fit_dataset"] = ""
            model.meta.info["fit_time"] = ""
            model.meta.dump(source)

            shutil.rmtree(dest)

            model, vectorizer = vdsh.utility.load_model(self.model_name)

        self.status.emit(
            f"Model loaded. Extracting train from '{self.dataset_name}' dataset"
        )

        # Extract train dataset
        X = extract_train(self.dataset_name)

        self.status.emit("Compiling the model...")

        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            self.initial_rate,
            decay_steps=self.decay_steps,
            decay_rate=self.decay_rate,
            staircase=True)

        if self.optimizer == "adam":
            opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
        else:
            raise NotImplementedError("No such optimizer")

        model.compile(optimizer=opt)

        self.status.emit(TRAINING_START_MSG)

        model.fit(X,
                  epochs=self.epochs,
                  batch_size=self.batch_size,
                  callbacks=[self.progbar_callback])

        # Flag model fit
        model.meta.flag_fit(self.dataset_name)

        # The fitted model is saved and marked as fit; it cannot be refit without copying
        vdsh.utility.dump_model(model)

        self.status.emit(MODEL_SAVED_MSG)
        self.finished.emit()
Example #3
def scan_datasets() -> list[DatasetMetaInfo]:
    """Returns the list of DatasetMetaInfo for the dirs in data_home;
    if no meta.json is provided, the name and path are inferred"""
    data_home = usersetup.load_config()["model"]["data_home"]

    result: list[DatasetMetaInfo] = list()

    # Get the list of paths and dataset names; skip the first entry (data_home itself)
    datasets = os.walk(data_home)
    next(datasets)

    for d in datasets:
        path, _, files = d

        # os.path.basename handles both path separators, unlike splitting on "\\"
        name = os.path.basename(path)

        if META_FILE_NAME in files and DATA_FILE_NAME in files:
            mi = DatasetMetaInfo.from_file(path)
        elif DATA_FILE_NAME in files:
            mi = DatasetMetaInfo(name, np.nan, np.nan, np.nan)
        else:
            mi = DatasetMetaInfo.undefined_preset(name)

        result.append(mi)

    return result
Example #4
def check_model_available(name: str) -> Optional[ModelMetaInfo]:
    """Returns model meta info if it exists, else None"""
    model_home = load_config()["model"]["model_home"]
    dest = f"{model_home}/{name}"

    try:
        return ModelMetaInfo.from_file(dest)
    except OSError:
        return None
Example #5
def check_dataset_available(name: str) -> Optional[DatasetMetaInfo]:
    """Returns dataset meta info if it exists, else None"""
    data_home = load_config()["model"]["data_home"]
    dest = f"{data_home}/{name}"

    try:
        return DatasetMetaInfo.from_file(dest)
    except OSError:
        return None
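A short usage sketch combining the two checks above as a guard before training; the names are made up, and the .name attribute on the meta-info objects is assumed from how ModelMetaInfo(name) is constructed elsewhere on this page:

# Abort early if either the model or the dataset directory is missing (hypothetical names)
model_meta = check_model_available("vdsh_demo")
dataset_meta = check_dataset_available("20ng")

if model_meta is None or dataset_meta is None:
    print("Model or dataset not found; nothing to train")
else:
    print(f"Ready to fit '{model_meta.name}' on '{dataset_meta.name}'")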
Example #6
    def run(self):
        self.status.emit(f"Loading model '{self._model_name}'...")
        self.progress.emit(PROGRESS_LOADING)

        model, _ = vdsh.utility.load_model(self._model_name)

        self.status.emit(f"Loading data...")
        self.progress.emit(PROGRESS_LOADING_MODEL)

        data_home = load_config()["model"]["data_home"]
        try:
            with h5py.File(f"{data_home}/{self._dataset_name}/data.hdf5", "r") as hf:
                train: np.ndarray = hf["train"][:]
                train_targets: np.ndarray = hf["train_labels"][:]
                test: np.ndarray = hf["test"][:]
                test_targets: np.ndarray = hf["test_labels"][:]

                self.status.emit(f"Running predict...")
                self.progress.emit(PROGRESS_IO_COMPLETE)

                train_pred = model.predict(train)
                test_pred = model.predict(test)

                self.status.emit(f"Transforming to binary codes")
                self.progress.emit(PROGRESS_AFTER_PREDICT)

                train_codes = medhash_transform(train_pred)
                test_codes = medhash_transform(test_pred)

                current_progress = PROGRESS_AFTER_PREDICT
                end_progress = TARGET_PROGRESS
                steps = len(test_codes)
                progress_per_step = (end_progress - current_progress) / steps

                self.status.emit(f"Running metrics tests...")

                precision_scores = []

                for idx, tc in enumerate(test_codes):
                    retrieved = top_k_indices(tc, train_codes, self._k)[0]
                    r = precision(test_targets[idx], train_targets, retrieved, self._k)
                    precision_scores.append(r)

                    current_progress += progress_per_step
                    self.progress.emit(math.floor(current_progress))

                mean_precision = np.array(precision_scores).mean()
                self.precisionResult.emit(mean_precision)

                self.progress.emit(FINISHED_PROGRESS)
                self.finished.emit()
                self.status.emit("Finished")
        except (IOError, OSError):
            print(f"Cannot reach data.hdf5 in {self._dataset_name}")

            self.progress.emit(0)
            self.finished.emit()
            self.status.emit("Failed to read data")
Example #7
def create_20ng(vocab_size: int, name: str = "20ng"):
    """Fetches 20ng dataset in plaintext thanks to sklearn, then uses custom Tfidf vectorizer
    to encode the dataset according to specified vocab_size

    Parameters
    ----------
    vocab_size : int
        Target vocabulary size of the dataset
    name : str
        Output name of the dataset, default '20ng'

    Returns
    -------
    None

    """
    try:
        data_home = load_config()["model"]["data_home"]
        dest = f"{data_home}/{name}"

        try:
            os.mkdir(f"{data_home}/{name}")
        except FileExistsError:
            pass

        print("Fetching 20ng...")
        train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
        test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))

        print("Vectorizing...")
        v = TfidfVectorizer(stop_words="english", max_features=vocab_size)

        # Scipy sparse matrices
        sparse_train_tfidf: scipy.sparse.csr_matrix = v.fit_transform(train.data)
        sparse_test_tfidf = v.transform(test.data)

        print("Saving dataset...")
        with h5py.File(f"{dest}/data.hdf5", "w") as hf:
            hf.create_dataset(name="train", data=sparse_train_tfidf.toarray(), compression="gzip")
            hf.create_dataset(name="train_labels", data=train.target)
            hf.create_dataset(name="test", data=sparse_test_tfidf.toarray(), compression="gzip")
            hf.create_dataset(name="test_labels", data=test.target)

        mi = DatasetMetaInfo(name,
                             vocab_size,
                             num_train=sparse_train_tfidf.shape[0],
                             num_test=sparse_test_tfidf.shape[0],
                             num_labels=1)

        mi.dump(dest)

    except (KeyError, IOError):
        print("Couldn't read config.json file")
Example #8
def load_model(model_name: str) -> tuple[VDSH, DocumentVectorizer]:
    """Loads the model from model_home/model_name

    If the saved model exists, it is returned compiled and ready to use

    If the model doesn't exist but a meta info file is present, the model is created ad hoc
    and returned as well

    Parameters
    ----------
    model_name : str
        A qualified model name

    Returns
    -------
    tuple[VDSH, DocumentVectorizer]
        The retrieved model and its vectorizer; the vectorizer is None if not present
    """
    config = load_config()
    model_home = config["model"]["model_home"]

    mi = ModelMetaInfo.from_file(f"{model_home}/{model_name}")

    try:
        model = tf.keras.models.load_model(f"{model_home}/{model_name}")
    except OSError:
        print("Model not found. Creating model...")
        model = create_vdsh(mi.vocab_size,
                            mi.hidden_dim,
                            mi.latent_dim,
                            mi.kl_step,
                            mi.dropout_prob,
                            mi.name)

    # Push meta info to model
    model.meta = mi

    try:
        vec = storage.load_vectorizer(f"{model_home}/{model_name}")
    except (FileNotFoundError, IOError):
        print("Vectorizer not found")
        vec = None

    print("Model loaded:")
    print(model.meta.info)

    return model, vec
Example #9
def remove_model(name: str) -> bool:
    """Permanently removes the model from files at model_home

        Parameters
        ----------
        name : str
            Qualified model name

        Returns
        -------
        bool
            True if deletion succeeded, False otherwise

        """
    model_home = load_config()["model"]["model_home"]
    dirpath = f"{model_home}/{name}"
    return _remove_entity(dirpath)
Example #10
def scan_models() -> list[ModelMetaInfo]:
    """Returns the list of ModelMetaInfo for the dirs in model_home;
    if no meta.json is provided, the name is inferred from the path"""
    model_home = usersetup.load_config()["model"]["model_home"]

    result: list[ModelMetaInfo] = list()

    # Get the list of model directory paths under model_home
    model_paths = glob(f"{model_home}/*/")

    for path in model_paths:
        # The glob pattern ends with a separator; take the last directory component portably
        name = os.path.basename(os.path.dirname(path))

        try:
            mi = ModelMetaInfo.from_file(path)
            result.append(mi)
        except FileNotFoundError:
            mi = ModelMetaInfo(name)
            result.append(mi)

    return result
Example #11
def dump_model(model: VDSH):
    """Dumps the model and the vectorizer at data_home/fit_dataset to model_home/model.meta.name

    Intended for use after fitting a model on a given dataset.
    A saved model cannot be refit directly; it has to be copied.

    Parameters
    ----------
    model : VDSH
        Model with meta info

    Returns
    -------
    None
    """
    config = load_config()
    model_home = config["model"]["model_home"]

    # Infer export model name and dataset name from meta info
    mi = model.meta
    model_name = mi.name
    dataset_name = mi.dataset_name

    model_dest = f"{model_home}/{model_name}"

    try:
        os.mkdir(model_dest)
    except FileExistsError:
        pass

    # Running predict to set up weights
    vocab_size = mi.vocab_size
    model.predict(np.zeros(shape=(1, vocab_size)))

    model.save(model_dest)
    mi.dump(model_dest)

    if dataset_name:
        datasets.copy_vectorizer(dataset_name, model_name)
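Taken together with Examples #2 and #8, a typical load → fit → dump cycle might look like the sketch below; the model and dataset names are hypothetical and the hyperparameters are arbitrary:

# Load the model (or create it ad hoc from its meta info), train it and persist it
model, _ = load_model("vdsh_demo")

X = extract_train("20ng")
model.compile(optimizer="adam")
model.fit(X, epochs=10, batch_size=64)

# Record which dataset the model was fit on, then save the model, its meta info and the vectorizer copy
model.meta.flag_fit("20ng")
dump_model(model)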
Example #12
def extract_train(dataset_name: str) -> Optional[np.ndarray]:
    """Extracts train subset of a given dataset if available

    Parameters
    ----------
    dataset_name : str
        Qualified dataset name

    Returns
    -------
    Optional[np.ndarray]
        The train subset of the dataset, a numpy ndarray with tfidf vectors as rows

    """
    data_home = load_config()["model"]["data_home"]

    try:
        with h5py.File(f"{data_home}/{dataset_name}/data.hdf5", "r") as hf:
            train: np.ndarray = hf["train"][:]
            return train
    except (IOError, OSError):
        print(f"Cannot reach data.hdf5 in {dataset_name}")
        return None
Example #13
def check_model_has_vectorizer(model_name: str) -> bool:
    """Returns True if the specified model has a vectorizer assigned, False otherwise"""
    model_home = load_config()["model"]["model_home"]

    return os.path.isfile(f"{model_home}/{model_name}/vectorizer.pkl")