Example #1
def do_encode(args):
    """Generate text embeddings with XTransformer and save to file.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    if os.path.isdir(args.save_emb_path):
        args.save_emb_path = os.path.join(args.save_emb_path, "embeddings.npy")

    xtf = XTransformer.load(args.model_folder)

    # load instance text
    X_text = Preprocessor.load_data_from_file(args.text_path, label_text_path=None, text_pos=0)[
        "corpus"
    ]

    X_emb = xtf.encode(
        X_text,
        batch_size=args.batch_size,
        batch_gen_workers=args.batch_gen_workers,
        use_gpu=args.use_gpu,
        max_pred_chunk=args.max_pred_chunk,
    )

    smat_util.save_matrix(args.save_emb_path, X_emb)
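
For context, a minimal sketch of driving do_encode programmatically; the argparse.Namespace fields and all paths below are hypothetical placeholders, not values from the source.

import argparse

# Hypothetical invocation of do_encode; every path and value is a placeholder.
args = argparse.Namespace(
    model_folder="./xtf_model",        # assumed XTransformer model folder
    text_path="./input_text.txt",      # one instance text per line
    save_emb_path="./embeddings.npy",
    batch_size=32,
    batch_gen_workers=4,
    use_gpu=True,
    max_pred_chunk=10**7,
)
do_encode(args)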
Example #2
    def save_feature_matrix(tgt, feat_mat):
        """Save feature matrix to file

        Args:
            tgt (str or file-like object): destination to save the feature matrix
            feat_mat (sparse matrix or ndarray): feature matrix to save
        """
        smat_util.save_matrix(tgt, feat_mat)
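
A round-trip sketch of save_feature_matrix with its loading counterpart, assuming the standard pecos.utils.smat_util import; smat_util.load_matrix reads the file back.

import numpy as np
import scipy.sparse as smat
from pecos.utils import smat_util

# Save a small sparse matrix and load it back to verify the round trip.
feat_mat = smat.csr_matrix(np.eye(3, dtype=np.float32))
smat_util.save_matrix("feat.npz", feat_mat)   # sparse matrices go to .npz
loaded = smat_util.load_matrix("feat.npz")
assert (loaded != feat_mat).nnz == 0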
Example #3
    def save(self, folder):
        """Save to disk.

        Args:
            folder (str): Folder to save to.
        """

        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, "config.json"), "w",
                  encoding="utf-8") as fout:
            fout.write(json.dumps({"len": len(self)}))

        for i, C in enumerate(self):
            smat_util.save_matrix(os.path.join(folder, f"C{i}.npz"), C)
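
A hypothetical inverse of this save (not shown in the source): read config.json for the count, then load each C{i}.npz back with smat_util.load_matrix.

import json
import os
from pecos.utils import smat_util

def load_cluster_matrices(folder):
    # Hypothetical counterpart of `save` above; reconstructs the matrix list.
    with open(os.path.join(folder, "config.json"), "r", encoding="utf-8") as fin:
        n = json.load(fin)["len"]
    return [smat_util.load_matrix(os.path.join(folder, f"C{i}.npz")) for i in range(n)]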
Example #4
def do_predict(args):
    """Predict and Evaluate for HNSW model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Load data
    Xt = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Load model
    model = HNSW.load(args.model_folder)

    # Setup HNSW Searchers for thread-safe inference
    threads = os.cpu_count() if args.threads <= 0 else args.threads
    searchers = model.searchers_create(num_searcher=threads)

    # Setup prediction params
    # pred_params.threads will be overridden if searchers are provided in model.predict()
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.only_topk,
        threads=threads,
    )

    # Model Predicting
    Yt_pred = model.predict(
        Xt,
        pred_params=pred_params,
        searchers=searchers,
        ret_csr=True,
    )

    # Save prediction
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate Recall@k
    if args.label_path:
        Yt = smat_util.load_matrix(args.label_path)
        # assuming ground truth is similarity-based (larger is better)
        Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.only_topk)
        # assuming the prediction matrix is distance-based, so convert: similarity = 1 - distance
        Yt_pred.data = 1.0 - Yt_pred.data
        metric = smat_util.Metrics.generate(Yt_topk,
                                            Yt_pred,
                                            topk=args.only_topk)
        print("Recall{}@{} {:.6f}%".format(args.only_topk, args.only_topk,
                                           100.0 * metric.recall[-1]))
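
A tiny illustration of the distance-to-similarity flip used above: rewriting .data touches only the stored nonzeros, so the sparsity pattern of the CSR prediction matrix is preserved.

import numpy as np
import scipy.sparse as smat

# Only the stored nonzero distances are flipped to similarities.
pred = smat.csr_matrix(np.array([[0.1, 0.0], [0.0, 0.4]], dtype=np.float32))
pred.data = 1.0 - pred.data
print(pred.toarray())
# [[0.9 0. ]
#  [0.  0.6]]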
Example #5
def do_predict(args):
    """Predict and Evaluate for xlinear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Load data
    Xt = XLinearModel.load_feature_matrix(args.inst_path)

    if args.selected_output is not None:
        # Selected Output
        selected_outputs_csr = XLinearModel.load_feature_matrix(
            args.selected_output)
        xlinear_model = XLinearModel.load(args.model_folder,
                                          is_predict_only=True,
                                          weight_matrix_type="CSC")
    else:
        # TopK
        selected_outputs_csr = None
        xlinear_model = XLinearModel.load(args.model_folder,
                                          is_predict_only=True)

    # Model Predicting
    Yt_pred = xlinear_model.predict(
        Xt,
        selected_outputs_csr=selected_outputs_csr,
        only_topk=args.only_topk,
        beam_size=args.beam_size,
        post_processor=args.post_processor,
        threads=args.threads,
        max_pred_chunk=args.max_pred_chunk,
    )

    # Save prediction
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate
    if args.label_path:
        Yt = XLinearModel.load_label_matrix(args.label_path)
        metric = smat_util.Metrics.generate(Yt, Yt_pred, topk=10)
        print("==== evaluation results ====")
        print(metric)
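
A sketch of building a selected_outputs_csr input, under the assumption (consistent with the code path above) that nonzero columns in row i mark the candidate labels to score for instance i.

import numpy as np
import scipy.sparse as smat

# Instance 0 is restricted to labels {1, 3}; instance 1 to label {4}.
rows = [0, 0, 1]
cols = [1, 3, 4]
vals = np.ones(len(rows), dtype=np.float32)
selected_outputs_csr = smat.csr_matrix((vals, (rows, cols)), shape=(2, 5))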
Example #6
def do_predict(args):
    """Predict with XTransformer and save the result.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    if os.path.isdir(args.save_pred_path):
        args.save_pred_path = os.path.join(args.save_pred_path, "P.npz")

    torch_util.set_seed(args.seed)

    xtf = XTransformer.load(args.model_folder)

    # load instance feature and text
    if args.feat_path:
        X_feat = smat_util.load_matrix(args.feat_path)
    else:
        X_feat = None
    X_text = Preprocessor.load_data_from_file(args.text_path,
                                              label_text_path=None,
                                              text_pos=0)["corpus"]

    P_matrix = xtf.predict(
        X_text,
        X_feat=X_feat,
        batch_size=args.batch_size,
        batch_gen_workers=args.batch_gen_workers,
        use_gpu=args.use_gpu,
        beam_size=args.beam_size,
        only_topk=args.only_topk,
        post_processor=args.post_processor,
        max_pred_chunk=args.max_pred_chunk,
        threads=args.threads,
    )

    smat_util.save_matrix(args.save_pred_path, P_matrix)
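
As a follow-up, a sketch of inspecting the predictions, assuming P_matrix from the snippet above is a CSR matrix whose row i holds the retained label scores for instance i.

import numpy as np

# Print instance 0's predicted labels in descending score order.
row = P_matrix.getrow(0)
order = np.argsort(-row.data)
for label_idx, score in zip(row.indices[order], row.data[order]):
    print(label_idx, score)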
Example #7
    def run(args):
        """Preprocess text using an existing preprocessor.

        Args:
            args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
        """
        preprocessor = Preprocessor.load(args.input_preprocessor_folder)
        if args.from_file and not args.output_label_path and not args.output_rel_path:
            Y, R = None, None
            corpus = args.input_text_path
        else:
            result = Preprocessor.load_data_from_file(
                args.input_text_path,
                label_text_path=args.label_text_path,
                maxsplit=args.maxsplit,
                text_pos=args.text_pos,
                label_pos=args.label_pos,
            )
            Y = result["label_matrix"]
            R = result["label_relevance"]
            corpus = result["corpus"]

        X = preprocessor.predict(
            corpus,
            batch_size=args.batch_size,
            use_gpu_if_available=args.use_gpu,
            buffer_size=args.buffer_size,
            threads=args.threads,
        )

        smat_util.save_matrix(args.output_inst_path, X)

        if args.output_label_path and Y is not None:
            smat_util.save_matrix(args.output_label_path, Y)
        if args.output_rel_path and R is not None:
            smat_util.save_matrix(args.output_rel_path, R)
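
For reference, a sketch of the parsed-result layout this relies on; the dict keys are exactly those used above, while the file paths and column positions are hypothetical (labels first, text second).

result = Preprocessor.load_data_from_file(
    "train.txt",                   # hypothetical input text file
    label_text_path="labels.txt",  # hypothetical label text file
    text_pos=1,
    label_pos=0,
)
Y = result["label_matrix"]     # sparse instance-to-label matrix
R = result["label_relevance"]  # relevance values (may be None)
corpus = result["corpus"]      # list of raw input texts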
Example #8
    def train(
        cls,
        input_text_path,
        output_text_path,
        label_embed_type="pifa",
        vectorizer_config=None,
        train_params=None,
        pred_params=None,
        workspace_folder=None,
        **kwargs,
    ):
        """Train a Text2Text model

        Args:

            input_text_path (str): Text input file name.
                Format: in each line, OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT
                where OUTPUT_IDs are the zero-based output item indices
                corresponding to the line numbers of OUTPUT_ITEM_PATH.
                We assume utf-8 encoding for text.
            output_text_path (str): The file path for output text items.
                Format: each line corresponds to a representation
                of the output item. We assume utf-8 encoding for text.
            label_embed_type (str): Label embedding type (default "pifa").
                We support pifa, pifa_lf_concat::Z=path, and pifa_lf_convex_combine::Z=path::alpha=scalar_value.
            vectorizer_config (str): JSON-format string for the vectorizer config (default None)
            train_params (Text2Text.TrainParams): parameters for training the Text2Text model
            pred_params (Text2Text.PredParams): parameters for Text2Text prediction
            workspace_folder (str, optional): folder for storing intermediate
                variables during training (default None)
            kwargs:
                {"beam_size": INT, "only_topk": INT, "post_processor": STR},
                    Default None to use HierarchicalMLModel.PredParams defaults

        Returns:
            A Text2Text object
        """

        ws = CachedWorkspace(workspace_folder)
        dtype = np.float32

        # Train Preprocessor and obtain X, Y
        XY_kwargs = dict(
            input_text_path=input_text_path,
            output_text_path=output_text_path,
            vectorizer_config=vectorizer_config,
            dtype=str(dtype),
        )

        # Prepare Preprocessor
        preprocessor_path = ws.get_path_for_name_and_kwargs("preprocessor", XY_kwargs)
        if path.exists(preprocessor_path):
            LOGGER.info("Loading existing preprocessor...")
            preprocessor = Preprocessor.load(preprocessor_path)
        else:
            LOGGER.info("Parsing text files...")
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
            corpus = parsed_result["corpus"]

            LOGGER.info(
                f"Training {vectorizer_config['type']} vectorizer on {len(corpus)} input texts..."
            )
            preprocessor = Preprocessor.train(corpus, vectorizer_config, dtype=dtype)
            preprocessor.save(preprocessor_path)

        # Prepare X, X could be dense or sparse
        X_path = ws.get_path_for_name_and_kwargs("X", XY_kwargs)

        if path.exists(X_path):
            X = XLinearModel.load_feature_matrix(X_path)
        else:
            if "corpus" not in locals():
                parse_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                Y = parse_result["label_matrix"]
                R = parse_result["label_relevance"]
                corpus = parse_result["corpus"]
            LOGGER.info(f"Vectorizing {len(corpus)} texts...")
            X = preprocessor.predict(corpus)
            XLinearModel.save_feature_matrix(X_path, X)
        LOGGER.info(
            f"{vectorizer_config['type']} input X loaded: {X.shape[0]} samples with {X.shape[1]} features."
        )

        # Prepare Y, Y is always sparse
        Y_path = ws.get_path_for_name_and_kwargs("Y", XY_kwargs) + ".npz"
        if path.exists(Y_path):
            Y = smat_util.load_matrix(Y_path)
        else:
            if "Y" not in locals():
                parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                Y = parsed_result["label_matrix"]
                R = parsed_result["label_relevance"]
            smat_util.save_matrix(Y_path, Y)
        LOGGER.info(f"Output label Y loaded: {Y.shape[0]} samples with {Y.shape[1]} labels.")

        # Prepare R, R should have same sparsity pattern as Y
        R_path = ws.get_path_for_name_and_kwargs("R", XY_kwargs) + ".npz"
        if path.exists(R_path):
            R = smat_util.load_matrix(R_path)
        else:
            if "R" not in locals():
                parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                R = parsed_result["label_relevance"]
            if R is not None:
                smat_util.save_matrix(R_path, R)
        if R is not None:
            LOGGER.info(f"Relevance matrix R loaded, cost sensitive learning enabled.")

        # construct indexing, training and prediction params
        if train_params is None:
            # fill all BaseParams class with their default value
            train_params = cls.TrainParams.from_dict(dict(), recursive=True)
        else:
            train_params = cls.TrainParams.from_dict(train_params)

        # construct pred_params
        if pred_params is None:
            # fill all BaseParams with their default value
            pred_params = cls.PredParams.from_dict(dict(), recursive=True)
        else:
            pred_params = cls.PredParams.from_dict(pred_params)
        pred_params = pred_params.override_with_kwargs(kwargs)

        # 1. Generate label features
        label_embed_kwargs = dict(
            input_text_path=input_text_path,
            output_text_path=output_text_path,
            dtype=str(dtype),
            vectorizer_config=vectorizer_config,
            embed_type=label_embed_type,
        )
        label_embed_path = ws.get_path_for_name_and_kwargs("L", label_embed_kwargs)
        if path.exists(label_embed_path):
            LOGGER.info(f"Loading existing {label_embed_type} features for {Y.shape[1]} labels...")
            label_feat = XLinearModel.load_feature_matrix(label_embed_path)
        else:
            LOGGER.info(f"Generating {label_embed_type} features for {Y.shape[1]} labels...")
            # parse embed_type string, expect either the following three cases:
            # (1) pifa
            # (2) pifa_lf_concat::Z=path
            # (3) pifa_lf_convex_combine::Z=path::alpha=value
            lemb_key_val_list = label_embed_type.split("::")
            lemb_type = lemb_key_val_list[0]
            lemb_kwargs = {}
            for key_val_str in lemb_key_val_list[1:]:
                key, val = key_val_str.split("=")
                if key == "Z":
                    Z = smat_util.load_matrix(val)
                    lemb_kwargs.update({"Z": Z})
                elif key == "alpha":
                    alpha = float(val)
                    lemb_kwargs.update({"alpha": alpha})
                else:
                    raise ValueError(f"key={key}, val={val} is not supported!")
            if "lf" in lemb_type and lemb_kwargs.get("Z", None) is None:
                raise ValueError(
                    "pifa_lf_concat/pifa_lf_convex_combine must provide external path for Z."
                )
            # Create label features
            label_feat = LabelEmbeddingFactory.create(
                Y,
                X,
                method=lemb_type,
                **lemb_kwargs,
            )
            XLinearModel.save_feature_matrix(label_embed_path, label_feat)

        # 2. Indexing
        indexer_kwargs_dict = train_params.indexer_params.to_dict()
        C_path = ws.get_path_for_name_and_kwargs("C", indexer_kwargs_dict)
        if path.exists(C_path):
            LOGGER.info(f"Loading existing clustering code with params {indexer_kwargs_dict}")
            C = ClusterChain.load(C_path)
        else:
            C = Indexer.gen(label_feat, train_params=train_params.indexer_params)
            LOGGER.info("Hierarchical label tree: {}".format([cc.shape[0] for cc in C]))
            C.save(C_path)

        del label_feat
        gc.collect()

        # Ensemble Models
        m = XLinearModel.train(
            X,
            Y,
            C=C,
            R=R,
            train_params=train_params.xlinear_params,
            pred_params=pred_params.xlinear_params,
            pred_kwargs=kwargs,
        )

        xlinear_models = [[m, train_params.to_dict()]]

        # Load output items
        with open(output_text_path, "r", encoding="utf-8") as f:
            output_items = [q.strip() for q in f]

        return cls(preprocessor, xlinear_models, output_items)
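
The embed-type parsing buried in train can be read standalone; a sketch covering the three accepted forms named in the comments above.

def parse_label_embed_type(label_embed_type):
    # Accepts: "pifa", "pifa_lf_concat::Z=path",
    # "pifa_lf_convex_combine::Z=path::alpha=value"
    parts = label_embed_type.split("::")
    lemb_type, lemb_kwargs = parts[0], {}
    for key_val_str in parts[1:]:
        key, val = key_val_str.split("=")
        if key == "Z":
            lemb_kwargs["Z"] = val            # path to the label embedding Z
        elif key == "alpha":
            lemb_kwargs["alpha"] = float(val)
        else:
            raise ValueError(f"key={key}, val={val} is not supported!")
    return lemb_type, lemb_kwargs

print(parse_label_embed_type("pifa_lf_convex_combine::Z=./Z.npz::alpha=0.5"))
# ('pifa_lf_convex_combine', {'Z': './Z.npz', 'alpha': 0.5})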
Example #9
    def train(
        cls,
        prob,
        clustering=None,
        val_prob=None,
        train_params=None,
        pred_params=None,
        **kwargs,
    ):
        """Train the XR-Transformer model with the given input data.

        Args:
            prob (MLProblemWithText): ML problem to solve.
            clustering (ClusterChain, optional): preliminary hierarchical label tree
                on which the transformer is fine-tuned.
            val_prob (MLProblemWithText, optional): ML problem for validation.
            train_params (XTransformer.TrainParams): training parameters for XTransformer
            pred_params (XTransformer.PredParams): prediction parameters for XTransformer
            kwargs:
                label_feat (ndarray or csr_matrix, optional): label features on which to generate preliminary HLT
                saved_trn_pt (str, optional): path to save the tokenized trn text. Use a tempdir if not given
                saved_val_pt (str, optional): path to save the tokenized val text. Use a tempdir if not given
                matmul_threads (int, optional): number of threads to use for
                    constructing the label tree. Defaults to at most 32 threads
                beam_size (int, optional): overrides only_topk for all models except
                    the bottom-layer one

        Returns:
            XTransformer
        """
        # tempdir to save tokenized text
        temp_dir = tempfile.TemporaryDirectory()
        saved_trn_pt = kwargs.get("saved_trn_pt", "")
        if not saved_trn_pt:
            saved_trn_pt = f"{temp_dir.name}/X_trn.pt"

        saved_val_pt = kwargs.get("saved_val_pt", "")
        if not saved_val_pt:
            saved_val_pt = f"{temp_dir.name}/X_val.pt"

        # construct train_params
        if train_params is None:
            # fill all BaseParams class with their default value
            train_params = cls.TrainParams.from_dict(dict(), recursive=True)
        else:
            train_params = cls.TrainParams.from_dict(train_params)
        # construct pred_params
        if pred_params is None:
            # fill all BaseParams with their default value
            pred_params = cls.PredParams.from_dict(dict(), recursive=True)
        else:
            pred_params = cls.PredParams.from_dict(pred_params)

        if not train_params.do_fine_tune:
            if isinstance(train_params.matcher_params_chain, list):
                matcher_train_params = train_params.matcher_params_chain[-1]
            else:
                matcher_train_params = train_params.matcher_params_chain

            if isinstance(pred_params.matcher_params_chain, list):
                matcher_pred_params = pred_params.matcher_params_chain[-1]
            else:
                matcher_pred_params = pred_params.matcher_params_chain

            device, n_gpu = torch_util.setup_device(matcher_train_params.use_gpu)

            if matcher_train_params.init_model_dir:
                parent_model = cls.load(matcher_train_params.init_model_dir)
                LOGGER.info("Loaded encoder from {}.".format(matcher_train_params.init_model_dir))
            else:
                parent_model = TransformerMatcher.download_model(
                    matcher_train_params.model_shortcut,
                )
                LOGGER.info(
                    "Downloaded encoder from {}.".format(matcher_train_params.model_shortcut)
                )

            parent_model.to_device(device, n_gpu=n_gpu)
            _, inst_embeddings = parent_model.predict(
                prob.X_text,
                pred_params=matcher_pred_params,
                batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                batch_gen_workers=matcher_train_params.batch_gen_workers,
                only_embeddings=True,
            )
            val_inst_embeddings = None
            if val_prob:
                _, val_inst_embeddings = parent_model.predict(
                    val_prob.X_text,
                    pred_params=matcher_pred_params,
                    batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                    batch_gen_workers=matcher_train_params.batch_gen_workers,
                    only_embeddings=True,
                )
        else:
            # 1. Construct the primary Hierarchical Label Tree
            if clustering is None:
                label_feat = kwargs.get("label_feat", None)
                if label_feat is None:
                    if prob.X_feat is None:
                        raise ValueError(
                            "Instance features are required to generate label features!"
                        )
                    label_feat = LabelEmbeddingFactory.pifa(prob.Y, prob.X_feat)

                clustering = Indexer.gen(
                    label_feat,
                    train_params=train_params.preliminary_indexer_params,
                )
            else:
                # assert cluster chain in clustering is valid
                clustering = ClusterChain(clustering)
                if clustering[-1].shape[0] != prob.nr_labels:
                    raise ValueError("nr_labels mismatch!")
            prelim_hierarchy = [cc.shape[0] for cc in clustering]
            LOGGER.info("Hierarchical label tree: {}".format(prelim_hierarchy))

            # get the number of fine-tuning tasks
            nr_transformers = sum(i <= train_params.max_match_clusters for i in prelim_hierarchy)

            LOGGER.info(
                "Fine-tune Transformers with nr_labels={}".format(
                    [cc.shape[0] for cc in clustering[:nr_transformers]]
                )
            )

            steps_scale = kwargs.get("steps_scale", None)
            if steps_scale is None:
                steps_scale = [1.0] * nr_transformers
            if len(steps_scale) != nr_transformers:
                raise ValueError(f"steps-scale length error: {len(steps_scale)}!={nr_transformers}")

            # construct fields with chain now we know the depth
            train_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                train_params, cls.TrainParams, nr_transformers
            )

            LOGGER.debug(
                f"XTransformer train_params: {json.dumps(train_params.to_dict(), indent=True)}"
            )

            pred_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                pred_params, cls.PredParams, nr_transformers
            )
            pred_params = pred_params.override_with_kwargs(kwargs)

            LOGGER.debug(
                f"XTransformer pred_params: {json.dumps(pred_params.to_dict(), indent=True)}"
            )

            def get_negative_samples(mat_true, mat_pred, scheme):
                if scheme == "tfn":
                    result = smat_util.binarized(mat_true)
                elif scheme == "man":
                    result = smat_util.binarized(mat_pred)
                elif "tfn" in scheme and "man" in scheme:
                    result = smat_util.binarized(mat_true) + smat_util.binarized(mat_pred)
                else:
                    raise ValueError("Unrecognized negative sampling method {}".format(scheme))
                LOGGER.debug(
                    f"Construct {scheme} with shape={result.shape} avr_M_nnz={result.nnz/result.shape[0]}"
                )
                return result

            # construct label chain for training and validation set
            # avoid large matmul_threads to prevent overhead in Y.dot(C) and save memory
            matmul_threads = kwargs.get("threads", os.cpu_count())
            matmul_threads = min(32, matmul_threads)
            YC_list = [prob.Y]
            for cur_C in reversed(clustering[1:]):
                Y_t = clib.sparse_matmul(YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                YC_list.append(Y_t)
            YC_list.reverse()

            if val_prob is not None:
                val_YC_list = [val_prob.Y]
                for cur_C in reversed(clustering[1:]):
                    Y_t = clib.sparse_matmul(val_YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                    val_YC_list.append(Y_t)
                val_YC_list.reverse()

            parent_model = None
            M, val_M = None, None
            M_pred, val_M_pred = None, None
            bootstrapping, inst_embeddings = None, None
            for i in range(nr_transformers):
                cur_train_params = train_params.matcher_params_chain[i]
                cur_pred_params = pred_params.matcher_params_chain[i]
                cur_train_params.max_steps = steps_scale[i] * cur_train_params.max_steps
                cur_train_params.num_train_epochs = (
                    steps_scale[i] * cur_train_params.num_train_epochs
                )

                cur_ns = cur_train_params.negative_sampling

                # construct train and val problem for level i
                # note that the final layer does not need X_feat
                if i > 0:
                    M = get_negative_samples(YC_list[i - 1], M_pred, cur_ns)

                cur_prob = MLProblemWithText(
                    prob.X_text,
                    YC_list[i],
                    X_feat=None if i == nr_transformers - 1 else prob.X_feat,
                    C=clustering[i],
                    M=M,
                )
                if val_prob is not None:
                    if i > 0:
                        val_M = get_negative_samples(val_YC_list[i - 1], val_M_pred, cur_ns)
                    cur_val_prob = MLProblemWithText(
                        val_prob.X_text,
                        val_YC_list[i],
                        X_feat=None if i == nr_transformers - 1 else val_prob.X_feat,
                        C=clustering[i],
                        M=val_M,
                    )
                else:
                    cur_val_prob = None

                avr_trn_labels = (
                    float(cur_prob.M.nnz) / YC_list[i].shape[0]
                    if cur_prob.M is not None
                    else YC_list[i].shape[1]
                )
                LOGGER.info(
                    "Fine-tuning XR-Transformer with {} at level {}, nr_labels={}, avr_M_nnz={}".format(
                        cur_ns, i, YC_list[i].shape[1], avr_trn_labels
                    )
                )

                # bootstrapping with previous text_encoder and instance embeddings
                if parent_model is not None:
                    init_encoder = deepcopy(parent_model.text_encoder)
                    init_text_model = deepcopy(parent_model.text_model)
                    bootstrapping = (init_encoder, inst_embeddings, init_text_model)

                # determine whether train prediction and instance embeddings are needed
                return_train_pred = (
                    i + 1 < nr_transformers
                ) and "man" in train_params.matcher_params_chain[i + 1].negative_sampling
                return_train_embeddings = (
                    i + 1 == nr_transformers
                ) or "linear" in cur_train_params.bootstrap_method

                res_dict = TransformerMatcher.train(
                    cur_prob,
                    csr_codes=M_pred,
                    val_prob=cur_val_prob,
                    val_csr_codes=val_M_pred,
                    train_params=cur_train_params,
                    pred_params=cur_pred_params,
                    bootstrapping=bootstrapping,
                    return_dict=True,
                    return_train_pred=return_train_pred,
                    return_train_embeddings=return_train_embeddings,
                    saved_trn_pt=saved_trn_pt,
                    saved_val_pt=saved_val_pt,
                )
                parent_model = res_dict["matcher"]
                M_pred = res_dict["trn_pred"]
                val_M_pred = res_dict["val_pred"]
                inst_embeddings = res_dict["trn_embeddings"]
                val_inst_embeddings = res_dict["val_embeddings"]

        if train_params.save_emb_dir:
            os.makedirs(train_params.save_emb_dir, exist_ok=True)
            if inst_embeddings is not None:
                smat_util.save_matrix(
                    os.path.join(train_params.save_emb_dir, "X.trn.npy"),
                    inst_embeddings,
                )
                LOGGER.info(f"Trn embeddings saved to {train_params.save_emb_dir}/X.trn.npy")
            if val_inst_embeddings is not None:
                smat_util.save_matrix(
                    os.path.join(train_params.save_emb_dir, "X.val.npy"),
                    val_inst_embeddings,
                )
                LOGGER.info(f"Val embeddings saved to {train_params.save_emb_dir}/X.val.npy")

        ranker = None
        if not train_params.only_encoder:
            # construct X_concat
            X_concat = TransformerMatcher.concat_features(
                prob.X_feat,
                inst_embeddings,
                normalize_emb=True,
            )
            del inst_embeddings
            LOGGER.info("Constructed instance feature matrix with shape={}".format(X_concat.shape))

            # 3. construct refined HLT
            if not train_params.fix_clustering:
                clustering = Indexer.gen(
                    LabelEmbeddingFactory.pifa(prob.Y, X_concat),
                    train_params=train_params.refined_indexer_params,
                )
            LOGGER.info(
                "Hierarchical label tree for ranker: {}".format([cc.shape[0] for cc in clustering])
            )

            # the HLT could have changed depth
            train_params.ranker_params.hlm_args = (
                HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                    train_params.ranker_params.hlm_args,
                    HierarchicalMLModel.TrainParams,
                    len(clustering),
                )
            )
            pred_params.ranker_params.hlm_args = (
                HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                    pred_params.ranker_params.hlm_args,
                    HierarchicalMLModel.PredParams,
                    len(clustering),
                )
            )
            pred_params.ranker_params.override_with_kwargs(kwargs)

            # train the ranker
            LOGGER.info("Start training ranker...")

            ranker = XLinearModel.train(
                X_concat,
                prob.Y,
                C=clustering,
                train_params=train_params.ranker_params,
                pred_params=pred_params.ranker_params,
            )

        return cls(parent_model, ranker)
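
A minimal sketch of the YC_list construction above, using plain scipy matmul in place of clib.sparse_matmul: the instance-to-label matrix is propagated up the cluster chain so every level gets its own label matrix.

import numpy as np
import scipy.sparse as smat

Y = smat.csr_matrix(np.eye(4, dtype=np.float32))  # 4 instances x 4 labels
C1 = smat.csr_matrix(np.repeat(np.eye(2, dtype=np.float32), 2, axis=0))  # 4 labels -> 2 clusters

YC_list = [Y]
for cur_C in reversed([C1]):      # stands in for clustering[1:]
    YC_list.append((YC_list[-1] @ cur_C).tocsr())
YC_list.reverse()
print([m.shape for m in YC_list])  # [(4, 2), (4, 4)]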
Example #10
def do_predict(args):
    """Predict and Evaluate for xlinear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Load data
    LOGGER.info("| loading data begin...")
    start_time = time.time()
    Xt = XLinearModel.load_feature_matrix(args.inst_path)
    Xt = normalize(Xt, axis=1, norm="l2")
    run_time_data = time.time() - start_time
    LOGGER.info(
        "| loading data finsihed | time(s) {:9.4f}".format(run_time_data))

    LOGGER.info("| loading model begin...")
    start_time = time.time()
    if args.selected_output is not None:
        # Selected Output
        selected_outputs_csr = XLinearModel.load_feature_matrix(
            args.selected_output)
        xlinear_model = XLinearModel.load(args.model_folder,
                                          is_predict_only=True,
                                          weight_matrix_type="CSC")
    else:
        # TopK
        selected_outputs_csr = None
        xlinear_model = XLinearModel.load(args.model_folder,
                                          is_predict_only=True)
    run_time_io = time.time() - start_time
    LOGGER.info(
        "| loading model finsihed | time(s) {:9.4f}".format(run_time_io))

    # Model Predicting
    LOGGER.info("| inference model begin...")
    start_time = time.time()
    Yt_pred = xlinear_model.predict(
        Xt,
        selected_outputs_csr=selected_outputs_csr,
        only_topk=args.only_topk,
        beam_size=args.beam_size,
        post_processor=args.post_processor,
        threads=args.threads,
        max_pred_chunk=args.max_pred_chunk,
    )
    run_time_pred = time.time() - start_time
    LOGGER.info(
        "| inference model finsihed | time(s) {:9.4f} latency(ms/q) {:9.4f}".
        format(
            run_time_pred,
            run_time_pred / Xt.shape[0] * 1000,
        ))

    # Save prediction
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate
    if args.label_path:
        Yt = XLinearModel.load_label_matrix(args.label_path)
        metric = smat_util.Metrics.generate(Yt, Yt_pred, topk=10)
        print("==== evaluation results ====")
        print(metric)
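
The L2 row-normalization applied to Xt above scales each row to unit norm; a two-row illustration with sklearn.preprocessing.normalize.

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[3.0, 4.0], [0.0, 2.0]], dtype=np.float32)
print(normalize(X, axis=1, norm="l2"))
# [[0.6 0.8]
#  [0.  1. ]]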
Example #11
def main():
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument(
        '--raw-text-path',
        type=str,
        required=True,
        help="Path of raw text (.txt file, each raw correspond to a node)")
    parser.add_argument(
        '--vectorizer-config-path',
        type=str,
        required=True,
        help="a path to a json file that specify the tfidf hyper-paramters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Save all outputs under args.xrt_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset,
                                     root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)
    # Filter out nodes whose degree >= args.max_deg
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes: {}'.format(data.x.shape[0]))
    print('Number of filtered nodes: {}'.format(len(Filtered_idx)))

    # Construct and save label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering for raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            if Filtered_idx[count].item() == cur_idx:
                fout.write(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering for tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(
        args)  # using args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list,
                                      vectorizer_config,
                                      dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.npz and X.all.npz")