def test_save_and_load(tmpdir):
    import random

    import numpy as np
    from pytest import approx  # missing in the original snippet; `approx` is used below

    from pecos.ann.hnsw import HNSW
    from pecos.utils import smat_util

    random.seed(1234)
    np.random.seed(1234)
    X_trn = smat_util.load_matrix("test/tst-data/ann/X.trn.l2-normalized.npy").astype(np.float32)
    X_tst = smat_util.load_matrix("test/tst-data/ann/X.tst.l2-normalized.npy").astype(np.float32)
    model_folder = tmpdir.join("hnsw_model_dir")

    train_params = HNSW.TrainParams(M=36, efC=90, metric_type="ip", threads=1)
    pred_params = HNSW.PredParams(efS=80, topk=10, threads=1)
    model = HNSW.train(
        X_trn,
        train_params=train_params,
        pred_params=pred_params,
    )
    Yp_from_mem, _ = model.predict(X_tst, ret_csr=False)
    model.save(model_folder)
    del model

    model = HNSW.load(model_folder)
    Yp_from_file, _ = model.predict(X_tst, pred_params=pred_params, ret_csr=False)
    assert Yp_from_mem == approx(
        Yp_from_file, abs=0.0
    ), "save and load failed: Yp_from_mem != Yp_from_file"
def load(cls, path_to_cluster):
    """Load from disk.

    Args:
        path_to_cluster (str): Folder where `ClusterChain` was saved to using `ClusterChain.save`.

    Returns:
        ClusterChain: The loaded object.
    """
    if os.path.isfile(path_to_cluster):
        C = smat_util.load_matrix(path_to_cluster)
        return cls.from_partial_chain(C)

    config_path = os.path.join(path_to_cluster, "config.json")
    if not os.path.exists(config_path):
        raise ValueError(f"Cluster config file, {config_path}, does not exist")
    with open(config_path, "r", encoding="utf-8") as fin:
        config = json.loads(fin.read())
    length = config.get("len", None)
    if length is None:
        raise ValueError(f'Cluster config file, {config_path}, does not have "len" parameter')

    chain = []
    for i in range(length):
        chain.append(
            smat_util.load_matrix(os.path.join(path_to_cluster, f"C{i}.npz"))
            .tocsc()
            .astype(np.float32)
        )
    return cls(chain)
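# A minimal round-trip sketch for the loader above, assuming `ClusterChain` is
# importable from pecos.xmc.base and that `from_partial_chain` expands a single
# flat clustering matrix into a chain. The random matrix and temp directory
# below are illustrative only, not from the source.
import tempfile

import numpy as np
import scipy.sparse as smat
from pecos.xmc.base import ClusterChain

C = smat.random(32, 4, density=0.25, format="csc", dtype=np.float32)  # 32 labels, 4 clusters
chain = ClusterChain.from_partial_chain(C)
save_dir = tempfile.mkdtemp()
chain.save(save_dir)  # writes config.json and C{i}.npz, matching load() above
chain_loaded = ClusterChain.load(save_dir)
assert [c.shape for c in chain_loaded] == [c.shape for c in chain]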
def preprocessor_cli(tmpdir, config_path, src_input_file, tgt_input_file, label_file):
    # `src_input_file` and `label_file` were referenced but missing from the
    # original signature; `assert_matrix_equal` is a test helper assumed to be
    # defined at module level.
    import shlex
    import subprocess

    from pecos.utils import smat_util

    model_folder = str(tmpdir.join("vectorizer"))
    x_file = str(tmpdir.join("x"))
    y_file = str(tmpdir.join("y.npz"))

    # Build
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["build"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["--text-pos 1"]
    cmd += ["--vectorizer-config-path {}".format(config_path)]
    cmd += ["-m {}".format(model_folder)]
    print(" ".join(cmd))
    process = subprocess.run(
        shlex.split(" ".join(cmd)), stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    assert process.returncode == 0

    # Run
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["run"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["-l {}".format(label_file)]
    cmd += ["-p {}".format(model_folder)]
    cmd += ["-x {}".format(x_file)]
    cmd += ["-y {}".format(y_file)]
    cmd += ["--text-pos 1"]
    cmd += ["--label-pos 0"]
    cmd += ["--threads 1"]
    print(" ".join(cmd))
    process = subprocess.run(
        shlex.split(" ".join(cmd)), stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    assert process.returncode == 0
    X = smat_util.load_matrix(x_file)
    Xtgt = smat_util.load_matrix(tgt_input_file)
    assert_matrix_equal(Xtgt, X)

    # Run without labels
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["run"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["-p {}".format(model_folder)]
    cmd += ["-x {}".format(x_file)]
    cmd += ["--text-pos 1"]
    print(" ".join(cmd))
    process = subprocess.run(
        shlex.split(" ".join(cmd)), stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    assert process.returncode == 0
    X = smat_util.load_matrix(x_file)
    Xtgt = smat_util.load_matrix(tgt_input_file)
    assert_matrix_equal(Xtgt, X)
def do_evaluation(args):
    """Evaluate xlinear predictions"""
    assert len(args.tags) == len(args.pred_path)
    Y_true = sorted_csr(load_matrix(args.truth_path).tocsr())
    Y_pred = [sorted_csr(load_matrix(pp).tocsr()) for pp in args.pred_path]
    print("==== evaluation results ====")
    CsrEnsembler.print_ens(Y_true, Y_pred, args.tags, ens_method=args.ens_method)
def do_evaluation(args):
    """Evaluate xlinear predictions

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    Y_true = smat_util.load_matrix(args.truth_path).tocsr()
    Y_pred = smat_util.load_matrix(args.pred_path).tocsr()
    metric = smat_util.Metrics.generate(Y_true, Y_pred, topk=args.topk)
    print("==== evaluation results ====")
    print(metric)
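# A minimal sketch of smat_util.Metrics.generate on toy data, assuming the
# ground truth is a binary csr matrix and the prediction carries ranking
# scores; the matrices below are illustrative only.
import numpy as np
import scipy.sparse as smat
from pecos.utils import smat_util

Y_true = smat.csr_matrix(np.array([[1, 0, 1], [0, 1, 0]], dtype=np.float32))
Y_pred = smat.csr_matrix(np.array([[0.9, 0.1, 0.4], [0.2, 0.8, 0.3]], dtype=np.float32))
metric = smat_util.Metrics.generate(Y_true, Y_pred, topk=3)
print(metric)  # prints prec@1..3 and recall@1..3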
def do_predict(args):
    """Predict and evaluate with an HNSW model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    # Load data
    Xt = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Load model
    model = HNSW.load(args.model_folder)

    # Set up HNSW searchers for thread-safe inference
    threads = os.cpu_count() if args.threads <= 0 else args.threads
    searchers = model.searchers_create(num_searcher=threads)

    # Set up prediction params.
    # pred_params.threads will be overridden if searchers are provided in model.predict()
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.only_topk,
        threads=threads,
    )

    # Model prediction
    Yt_pred = model.predict(
        Xt,
        pred_params=pred_params,
        searchers=searchers,
        ret_csr=True,
    )

    # Save prediction
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate Recall@k
    if args.label_path:
        Yt = smat_util.load_matrix(args.label_path)
        # assuming ground truth is similarity-based (larger is better)
        Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.only_topk)
        # the prediction matrix is distance-based, so convert via similarity = 1 - distance
        Yt_pred.data = 1.0 - Yt_pred.data
        metric = smat_util.Metrics.generate(Yt_topk, Yt_pred, topk=args.only_topk)
        print("Recall{}@{} {:.6f}%".format(args.only_topk, args.only_topk, 100.0 * metric.recall[-1]))
def test_xtransformer_python_api():
    import numpy as np

    from pecos.utils import smat_util
    from pecos.utils.featurization.text.preprocess import Preprocessor
    from pecos.xmc.xtransformer.model import XTransformer
    from pecos.xmc.xtransformer.module import MLProblemWithText

    X_trn_file = "test/tst-data/xmc/xtransformer/train.txt"
    Y_trn_file = "test/tst-data/xmc/xtransformer/train_label.npz"
    # The next two paths are referenced but not defined in the original
    # snippet; these values are assumptions for illustration.
    train_feat_file = "test/tst-data/xmc/xtransformer/train_feat.npz"
    bert_model_path = "test/tst-data/xmc/xtransformer/bert_model"

    trn_corpus = Preprocessor.load_data_from_file(
        X_trn_file,
        label_text_path=None,
        text_pos=0,
    )["corpus"]
    X_trn = smat_util.load_matrix(train_feat_file, dtype=np.float32)
    Y_trn = smat_util.load_matrix(Y_trn_file, dtype=np.float32)

    trn_prob = MLProblemWithText(trn_corpus, Y_trn, X_feat=X_trn)

    train_params = XTransformer.TrainParams.from_dict({}, recursive=True)
    train_params.matcher_params_chain.init_model_dir = bert_model_path
    train_params.matcher_params_chain.batch_size = 1
    train_params.matcher_params_chain.num_train_epochs = 1
    train_params.matcher_params_chain.save_steps = 2
    train_params.matcher_params_chain.batch_gen_workers = 2

    pred_params = XTransformer.PredParams.from_dict({}, recursive=True)
    pred_params.matcher_params_chain.only_topk = 2
    pred_params.ranker_params.hlm_args.model_chain.only_topk = 2

    print(train_params.to_dict())
    print(pred_params.to_dict())

    xtf = XTransformer.train(
        trn_prob,
        train_params=train_params,
        pred_params=pred_params,
    )
    P = xtf.predict(trn_corpus, X_trn)
    metric = smat_util.Metrics.generate(Y_trn, P, topk=10)
    std_output = "prec = 100.00 100.00 66.67 50.00 40.00 33.33 28.57 25.00 22.22 20.00\nrecall = 41.67 83.33 83.33 83.33 83.33 83.33 83.33 83.33 83.33 83.33"
    assert str(metric) == std_output, f"{str(metric)} != {std_output}"
def load_label_matrix(src, for_training=False):
    """Load label matrix from file

    Args:
        src (str): path of the file to load the label matrix from
        for_training (bool, optional): if False (default) return csr_matrix, else return csc_matrix

    Returns:
        matrix (csr_matrix or csc_matrix): loaded label matrix
    """
    assert isinstance(src, str), "src for load_label_matrix must be a str"
    dtype = np.float32
    feat_mat = smat_util.load_matrix(src)
    feat_mat = feat_mat.tocsc() if for_training else feat_mat.tocsr()
    return feat_mat.astype(dtype)
def load_feature_matrix(src):
    """Load feature matrix from file

    Args:
        src (str or file-like object): file to load the feature matrix from

    Returns:
        matrix (csr_matrix or ndarray): loaded feature matrix
    """
    feat_mat = smat_util.load_matrix(src)
    if isinstance(feat_mat, np.ndarray):
        feat_mat = np.ascontiguousarray(feat_mat)
    elif isinstance(feat_mat, smat.spmatrix):
        feat_mat = feat_mat.tocsr()
        feat_mat.sort_indices()
    return feat_mat
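# A small round-trip sketch for the two loaders above, assuming the usual
# .npy (dense) / .npz (sparse) convention seen in these snippets:
# smat_util.save_matrix writes either kind and smat_util.load_matrix restores
# it. The /tmp paths are illustrative only.
import numpy as np
import scipy.sparse as smat
from pecos.utils import smat_util

smat_util.save_matrix("/tmp/X_dense.npy", np.random.rand(4, 3).astype(np.float32))
smat_util.save_matrix("/tmp/X_sparse.npz", smat.random(4, 3, density=0.5, format="csr"))
print(type(smat_util.load_matrix("/tmp/X_dense.npy")))   # numpy.ndarray
print(type(smat_util.load_matrix("/tmp/X_sparse.npz")))  # scipy sparse matrix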
def do_train(args):
    """Train and save an HNSW model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    # Create model folder
    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    # Load training inputs
    X = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Set up training and prediction params.
    # Note that prediction params can be overridden at inference time.
    train_params = HNSW.TrainParams(
        M=args.max_edge_per_node,
        efC=args.efConstruction,
        metric_type=args.metric_type,
        max_level_upper_bound=args.max_level_upper_bound,
        threads=args.threads,
    )
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.only_topk,
        threads=args.threads,
    )

    # Train and save the HNSW indexer
    model = HNSW.train(
        X,
        train_params=train_params,
        pred_params=pred_params,
    )
    model.save(args.model_folder)
def do_predict(args):
    """Predict with XTransformer and save the result.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    if os.path.isdir(args.save_pred_path):
        args.save_pred_path = os.path.join(args.save_pred_path, "P.npz")

    torch_util.set_seed(args.seed)
    xtf = XTransformer.load(args.model_folder)

    # Load instance features and text
    if args.feat_path:
        X_feat = smat_util.load_matrix(args.feat_path)
    else:
        X_feat = None
    X_text = Preprocessor.load_data_from_file(
        args.text_path, label_text_path=None, text_pos=0
    )["corpus"]

    P_matrix = xtf.predict(
        X_text,
        X_feat=X_feat,
        batch_size=args.batch_size,
        batch_gen_workers=args.batch_gen_workers,
        use_gpu=args.use_gpu,
        beam_size=args.beam_size,
        only_topk=args.only_topk,
        post_processor=args.post_processor,
        max_pred_chunk=args.max_pred_chunk,
        threads=args.threads,
    )
    smat_util.save_matrix(args.save_pred_path, P_matrix)
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--bn', action='store_true')
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products', root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Load pre-trained node embeddings from PECOS if provided
    if args.node_emb_path:
        data.x = torch.from_numpy(
            smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(
            data.x.shape, args.node_emb_path))

    x = data.x
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout, args.bn).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def train(
    cls,
    input_text_path,
    output_text_path,
    label_embed_type="pifa",
    vectorizer_config=None,
    train_params=None,
    pred_params=None,
    workspace_folder=None,
    **kwargs,
):
    """Train a Text2Text model

    Args:
        input_text_path (str): Text input file name.
            Format: in each line, OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT
            where OUTPUT_IDs are the zero-based output item indices
            corresponding to the line numbers of OUTPUT_ITEM_PATH.
            We assume utf-8 encoding for text.
        output_text_path (str): The file path for output text items.
            Format: each line corresponds to a representation of the output item.
            We assume utf-8 encoding for text.
        label_embed_type (list of str): Label embedding types (default pifa).
            We support pifa, pifa_lf_concat::Z=path, and
            pifa_lf_convex_combine::Z=path::alpha=scalar_value.
            Multiple values will lead to different individual models for ensembling.
        vectorizer_config (dict): config for the vectorizer (default None)
        train_params (Text2Text.TrainParams): params to train Text2Text model
        pred_params (Text2Text.PredParams): params to predict with Text2Text model
        workspace_folder (str, default=None): A folder name for storing intermediate
            variables during training
        kwargs: {"beam_size": INT, "only_topk": INT, "post_processor": STR},
            Default None to use HierarchicalMLModel.PredParams defaults

    Returns:
        A Text2Text object
    """
    ws = CachedWorkspace(workspace_folder)
    dtype = np.float32

    # Train Preprocessor and obtain X, Y
    XY_kwargs = dict(
        input_text_path=input_text_path,
        output_text_path=output_text_path,
        vectorizer_config=vectorizer_config,
        dtype=str(dtype),
    )

    # Prepare Preprocessor
    preprocessor_path = ws.get_path_for_name_and_kwargs("preprocessor", XY_kwargs)
    if path.exists(preprocessor_path):
        LOGGER.info("Loading existing preprocessor...")
        preprocessor = Preprocessor.load(preprocessor_path)
    else:
        LOGGER.info("Parsing text files...")
        parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
        Y = parsed_result["label_matrix"]
        R = parsed_result["label_relevance"]
        corpus = parsed_result["corpus"]
        LOGGER.info(
            f"Training {vectorizer_config['type']} vectorizer on {len(corpus)} input texts..."
        )
        preprocessor = Preprocessor.train(corpus, vectorizer_config, dtype=dtype)
        preprocessor.save(preprocessor_path)

    # Prepare X; X could be dense or sparse
    X_path = ws.get_path_for_name_and_kwargs("X", XY_kwargs)
    if path.exists(X_path):
        X = XLinearModel.load_feature_matrix(X_path)
    else:
        if "corpus" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
            corpus = parsed_result["corpus"]
        LOGGER.info(f"Vectorizing {len(corpus)} texts...")
        X = preprocessor.predict(corpus)
        XLinearModel.save_feature_matrix(X_path, X)
    LOGGER.info(
        f"{vectorizer_config['type']} input X loaded: {X.shape[0]} samples with {X.shape[1]} features."
    )

    # Prepare Y; Y is always sparse
    Y_path = ws.get_path_for_name_and_kwargs("Y", XY_kwargs) + ".npz"
    if path.exists(Y_path):
        Y = smat_util.load_matrix(Y_path)
    else:
        if "Y" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
        smat_util.save_matrix(Y_path, Y)
    LOGGER.info(f"Output label Y loaded: {Y.shape[0]} samples with {Y.shape[1]} labels.")

    # Prepare R; R should have the same sparsity pattern as Y
    R_path = ws.get_path_for_name_and_kwargs("R", XY_kwargs) + ".npz"
    if path.exists(R_path):
        R = smat_util.load_matrix(R_path)
    else:
        if "R" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            R = parsed_result["label_relevance"]
        if R is not None:
            smat_util.save_matrix(R_path, R)
    if R is not None:
        LOGGER.info("Relevance matrix R loaded, cost sensitive learning enabled.")

    # Construct indexing, training and prediction params
    if train_params is None:
        # fill all BaseParams class with their default value
        train_params = cls.TrainParams.from_dict(dict(), recursive=True)
    else:
        train_params = cls.TrainParams.from_dict(train_params)

    # Construct pred_params
    if pred_params is None:
        # fill all BaseParams with their default value
        pred_params = cls.PredParams.from_dict(dict(), recursive=True)
    else:
        pred_params = cls.PredParams.from_dict(pred_params)
    pred_params = pred_params.override_with_kwargs(kwargs)

    # 1. Generate label features
    label_embed_kwargs = dict(
        input_text_path=input_text_path,
        output_text_path=output_text_path,
        dtype=str(dtype),
        vectorizer_config=vectorizer_config,
        embed_type=label_embed_type,
    )
    label_embed_path = ws.get_path_for_name_and_kwargs("L", label_embed_kwargs)
    if path.exists(label_embed_path):
        LOGGER.info(f"Loading existing {label_embed_type} features for {Y.shape[1]} labels...")
        label_feat = XLinearModel.load_feature_matrix(label_embed_path)
    else:
        LOGGER.info(f"Generating {label_embed_type} features for {Y.shape[1]} labels...")
        # Parse the embed_type string; expect one of the following three cases:
        # (1) pifa
        # (2) pifa_lf_concat::Z=path
        # (3) pifa_lf_convex_combine::Z=path::alpha=value
        lemb_key_val_list = label_embed_type.split("::")
        lemb_type = lemb_key_val_list[0]
        lemb_kwargs = {}
        for key_val_str in lemb_key_val_list[1:]:
            key, val = key_val_str.split("=")
            if key == "Z":
                Z = smat_util.load_matrix(val)
                lemb_kwargs.update({"Z": Z})
            elif key == "alpha":
                alpha = float(val)
                lemb_kwargs.update({"alpha": alpha})
            else:
                raise ValueError(f"key={key}, val={val} is not supported!")
        if "lf" in lemb_type and lemb_kwargs.get("Z", None) is None:
            raise ValueError(
                "pifa_lf_concat/pifa_lf_convex_combine must provide external path for Z."
            )
        # Create label features
        label_feat = LabelEmbeddingFactory.create(
            Y,
            X,
            method=lemb_type,
            **lemb_kwargs,
        )
        XLinearModel.save_feature_matrix(label_embed_path, label_feat)

    # 2. Indexing
    indexer_kwargs_dict = train_params.indexer_params.to_dict()
    C_path = ws.get_path_for_name_and_kwargs("C", indexer_kwargs_dict)
    if path.exists(C_path):
        LOGGER.info(f"Loading existing clustering code with params {indexer_kwargs_dict}")
        C = ClusterChain.load(C_path)
    else:
        C = Indexer.gen(label_feat, train_params=train_params.indexer_params)
        LOGGER.info("Hierarchical label tree: {}".format([cc.shape[0] for cc in C]))
        C.save(C_path)
    del label_feat
    gc.collect()

    # Ensemble Models
    m = XLinearModel.train(
        X,
        Y,
        C=C,
        R=R,
        train_params=train_params.xlinear_params,
        pred_params=pred_params.xlinear_params,
        pred_kwargs=kwargs,
    )
    xlinear_models = [[m, train_params.to_dict()]]

    # Load output items
    with open(output_text_path, "r", encoding="utf-8") as f:
        output_items = [q.strip() for q in f]

    return cls(preprocessor, xlinear_models, output_items)
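# A minimal usage sketch for Text2Text.train, assuming Text2Text is importable
# from pecos.apps.text2text.model and that the vectorizer config follows the
# {"type": ..., "kwargs": ...} convention used by the Preprocessor above.
# The file paths are placeholders, not from the source.
from pecos.apps.text2text.model import Text2Text

t2t = Text2Text.train(
    "input.txt",             # "OUT_ID1,OUT_ID2\tINPUT_TEXT" per line
    "output_items.txt",      # one output item per line
    vectorizer_config={"type": "tfidf", "kwargs": {}},
    workspace_folder="./t2t_ws",
)
t2t.save("./t2t_model")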
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--inductive', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=20000)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--num_steps', type=int, default=30)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=2)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products', root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Load pre-trained node features from PECOS
    if args.node_emb_path:
        data.x = torch.from_numpy(
            smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(
            data.x.shape, args.node_emb_path))

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # We omit normalization factors here since those are only defined for the
    # inductive learning setup.
    sampler_data = data
    if args.inductive:
        sampler_data = to_inductive(data)

    loader = GraphSAINTRandomWalkSampler(sampler_data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=4096, shuffle=False,
                                      num_workers=12)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')

            if epoch > 9 and epoch % args.eval_steps == 0:
                result = test(model, data, evaluator, subgraph_loader, device)
                logger.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
def do_train(args):
    """Train and save an xr-linear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XLinearModel.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XLinearModel.PredParams.from_dict(
            {}, recursive=True).to_dict()
        params["indexer_params"] = HierarchicalKMeans.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)
    indexer_params = params.get("indexer_params", None)

    if train_params is not None:
        train_params = XLinearModel.TrainParams.from_dict(train_params)
    else:
        train_params = XLinearModel.TrainParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XLinearModel.PredParams.from_dict(pred_params)
    else:
        pred_params = XLinearModel.PredParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if indexer_params is not None:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(indexer_params)
    else:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )
    if args.seed:
        indexer_params.seed = args.seed

    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    LOGGER.info("| loading data begin...")
    start_time = time.time()
    X = XLinearModel.load_feature_matrix(args.inst_path)
    X = normalize(X, axis=1, norm="l2")
    Y = XLinearModel.load_label_matrix(args.label_path, for_training=True)
    run_time_io = time.time() - start_time
    LOGGER.info("| loading data finished | time(s) {:9.4f}".format(run_time_io))

    LOGGER.info("| building HLT...")
    start_time = time.time()
    if args.code_path:
        cluster_chain = ClusterChain.load(args.code_path)
    else:
        if args.label_feat_path:
            label_feat = XLinearModel.load_feature_matrix(args.label_feat_path)
        else:
            label_feat = LabelEmbeddingFactory.create(Y, X, method="pifa")
        cluster_chain = Indexer.gen(label_feat, train_params=indexer_params)
    run_time_hlt = time.time() - start_time
    LOGGER.info("| building HLT finished | time(s) {:9.4f}".format(run_time_hlt))

    # Load label importance matrix if given
    if args.usn_label_path:
        usn_label_mat = smat_util.load_matrix(args.usn_label_path)
    else:
        usn_label_mat = None
    # Load user-supplied matching matrix if given
    if args.usn_match_path:
        usn_match_mat = smat_util.load_matrix(args.usn_match_path)
    else:
        usn_match_mat = None
    usn_match_dict = {0: usn_label_mat, 1: usn_match_mat}

    # Load relevance matrix for cost-sensitive learning
    if args.rel_path:
        R = smat_util.load_matrix(args.rel_path)
    else:
        R = None

    pred_kwargs = {}
    for kw in ["beam_size", "only_topk", "post_processor"]:
        if getattr(args, kw, None) is not None:
            pred_kwargs[kw] = getattr(args, kw)

    LOGGER.info("| training XR-Linear...")
    start_time = time.time()
    xlm = XLinearModel.train(
        X,
        Y,
        C=cluster_chain,
        R=R,
        user_supplied_negatives=usn_match_dict,
        train_params=train_params,
        pred_params=pred_params,
        pred_kwargs=pred_kwargs,
    )
    run_time_xrl = time.time() - start_time
    LOGGER.info("| training XR-Linear finished | time(s) {:9.4f}".format(run_time_xrl))

    xlm.save(args.model_folder)
    LOGGER.info(
        "| Finished with run_time(s) | total {:9.4f} hlt {:9.4f} xrl {:9.4f}".format(
            run_time_hlt + run_time_xrl,
            run_time_hlt,
            run_time_xrl,
        ))
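# A brief follow-up sketch, assuming a model saved by do_train above can be
# reloaded for inference, and that predict() accepts the same beam_size and
# only_topk keywords collected into pred_kwargs. Paths are placeholders.
from pecos.utils import smat_util
from pecos.xmc.xlinear.model import XLinearModel

xlm = XLinearModel.load("./xrlinear_model")
X_tst = XLinearModel.load_feature_matrix("test_feat.npz")
P = xlm.predict(X_tst, beam_size=10, only_topk=20)  # csr matrix of top scores
smat_util.save_matrix("P.npz", P)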
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--runs', type=int, default=5)
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-papers100M', root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Load pre-trained node embeddings from PECOS if provided
    if args.node_emb_path:
        data.x = torch.from_numpy(
            smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(
            data.x.shape, args.node_emb_path))

    x = data.x
    y = data.y.to(torch.long)

    train_dataset = SimpleDataset(x[split_idx['train']], y[split_idx['train']])
    valid_dataset = SimpleDataset(x[split_idx['valid']], y[split_idx['valid']])
    test_dataset = SimpleDataset(x[split_idx['test']], y[split_idx['test']])

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size * 4, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size * 4, shuffle=False)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-papers100M')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            train(model, device, train_loader, optimizer)
            train_acc = test(model, device, train_loader, evaluator)
            valid_acc = test(model, device, valid_loader, evaluator)
            test_acc = test(model, device, test_loader, evaluator)
            logger.add_result(run, (train_acc, valid_acc, test_acc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def do_train(args):
    """Train and save xlinear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XLinearModel.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XLinearModel.PredParams.from_dict(
            {}, recursive=True).to_dict()
        params["indexer_params"] = HierarchicalKMeans.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)
    indexer_params = params.get("indexer_params", None)

    if train_params is not None:
        train_params = XLinearModel.TrainParams.from_dict(train_params)
    else:
        train_params = XLinearModel.TrainParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XLinearModel.PredParams.from_dict(pred_params)
    else:
        pred_params = XLinearModel.PredParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if indexer_params is not None:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(indexer_params)
    else:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    # Create model folder
    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    # Load training inputs and labels
    X = XLinearModel.load_feature_matrix(args.inst_path)
    Y = XLinearModel.load_label_matrix(args.label_path, for_training=True)

    if args.code_path:
        cluster_chain = ClusterChain.load(args.code_path)
    else:
        if args.label_feat_path:
            label_feat = XLinearModel.load_feature_matrix(args.label_feat_path)
        else:
            label_feat = LabelEmbeddingFactory.create(Y, X, method="pifa")
        cluster_chain = Indexer.gen(label_feat, train_params=indexer_params)

    # Load label importance matrix if given
    if args.usn_label_path:
        usn_label_mat = smat_util.load_matrix(args.usn_label_path)
    else:
        usn_label_mat = None
    # Load user-supplied matching matrix if given
    if args.usn_match_path:
        usn_match_mat = smat_util.load_matrix(args.usn_match_path)
    else:
        usn_match_mat = None
    usn_match_dict = {0: usn_label_mat, 1: usn_match_mat}

    # Load relevance matrix for cost-sensitive learning
    if args.rel_path:
        R = smat_util.load_matrix(args.rel_path)
    else:
        R = None

    pred_kwargs = {}
    for kw in ["beam_size", "only_topk", "post_processor"]:
        if getattr(args, kw, None) is not None:
            pred_kwargs[kw] = getattr(args, kw)

    xlm = XLinearModel.train(
        X,
        Y,
        C=cluster_chain,
        R=R,
        user_supplied_negatives=usn_match_dict,
        train_params=train_params,
        pred_params=pred_params,
        pred_kwargs=pred_kwargs,
    )
    xlm.save(args.model_folder)
def do_train(args):
    """Train and save XR-Transformer model.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XTransformer.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XTransformer.PredParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)

    if train_params is not None:
        train_params = XTransformer.TrainParams.from_dict(train_params)
    else:
        train_params = XTransformer.TrainParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XTransformer.PredParams.from_dict(pred_params)
    else:
        pred_params = XTransformer.PredParams.from_dict(
            {k: v for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    torch_util.set_seed(args.seed)
    LOGGER.info("Setting random seed {}".format(args.seed))

    # Load training features
    if args.trn_feat_path:
        X_trn = smat_util.load_matrix(args.trn_feat_path, dtype=np.float32)
        LOGGER.info("Loaded training feature matrix with shape={}".format(X_trn.shape))
    else:
        X_trn = None
        LOGGER.info("Training feature matrix not provided")
        if not args.label_feat_path and not args.code_path:
            raise ValueError("trn-feat is required unless code-path or label-feat is provided.")

    # Load training labels
    Y_trn = smat_util.load_matrix(args.trn_label_path, dtype=np.float32)
    LOGGER.info("Loaded training label matrix with shape={}".format(Y_trn.shape))

    # Load test features if given
    if args.tst_feat_path:
        X_tst = smat_util.load_matrix(args.tst_feat_path, dtype=np.float32)
        LOGGER.info("Loaded test feature matrix with shape={}".format(X_tst.shape))
    else:
        X_tst = None

    # Load test labels if given
    if args.tst_label_path:
        Y_tst = smat_util.load_matrix(args.tst_label_path, dtype=np.float32)
        LOGGER.info("Loaded test label matrix with shape={}".format(Y_tst.shape))
    else:
        Y_tst = None

    # Load training texts
    trn_corpus = Preprocessor.load_data_from_file(
        args.trn_text_path,
        label_text_path=None,
        text_pos=0,
    )["corpus"]
    LOGGER.info("Loaded {} training sequences".format(len(trn_corpus)))

    # Load test texts if given
    if args.tst_text_path:
        tst_corpus = Preprocessor.load_data_from_file(
            args.tst_text_path,
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        LOGGER.info("Loaded {} test sequences".format(len(tst_corpus)))
    else:
        tst_corpus = None

    # Load cluster chain or label features
    cluster_chain, label_feat = None, None
    if os.path.exists(args.code_path):
        cluster_chain = ClusterChain.from_partial_chain(
            smat_util.load_matrix(args.code_path),
            min_codes=args.min_codes,
            nr_splits=args.nr_splits,
        )
        LOGGER.info("Loaded from code-path: {}".format(args.code_path))
    else:
        if os.path.isfile(args.label_feat_path):
            label_feat = smat_util.load_matrix(args.label_feat_path, dtype=np.float32)
            LOGGER.info("Loaded label feature matrix shape={}, from {}".format(
                label_feat.shape, args.label_feat_path))

    trn_prob = MLProblemWithText(trn_corpus, Y_trn, X_feat=X_trn)
    if all(v is not None for v in [tst_corpus, Y_tst]):
        val_prob = MLProblemWithText(tst_corpus, Y_tst, X_feat=X_tst)
    else:
        val_prob = None

    xtf = XTransformer.train(
        trn_prob,
        clustering=cluster_chain,
        val_prob=val_prob,
        train_params=train_params,
        pred_params=pred_params,
        beam_size=args.beam_size,
        steps_scale=args.steps_scale,
        label_feat=label_feat,
    )
    xtf.save(args.model_dir)
def test_predict_and_recall():
    import random

    import numpy as np
    import scipy.sparse as smat
    from pytest import approx  # missing in the original snippet; `approx` is used below

    from pecos.ann.hnsw import HNSW
    from pecos.utils import smat_util

    random.seed(1234)
    np.random.seed(1234)
    top_k = 10
    efS_list = [50, 75, 100]
    num_searcher_online = 2

    def calc_recall(Y_true, Y_pred):
        n_data, top_k = Y_true.shape
        recall = 0.0
        for qid in range(n_data):
            yt = set(Y_true[qid, :].flatten().data)
            yp = set(Y_pred[qid, :].flatten().data)
            recall += len(yt.intersection(yp)) / top_k
        recall = recall / n_data
        return recall

    # Load data matrices
    X_trn = smat_util.load_matrix("test/tst-data/ann/X.trn.l2-normalized.npy").astype(np.float32)
    X_tst = smat_util.load_matrix("test/tst-data/ann/X.tst.l2-normalized.npy").astype(np.float32)
    dense_model_folder = "test/tst-data/ann/hnsw-model-dense"
    sparse_model_folder = "test/tst-data/ann/hnsw-model-sparse"

    # Compute exact NN ground truth.
    # Valid for both ip and cosine similarity, since data is l2-normalized.
    Y_true = 1.0 - X_tst.dot(X_trn.T)
    Y_true = np.argsort(Y_true)[:, :top_k]

    # Test dense features
    model = HNSW.load(dense_model_folder)
    searchers = model.searchers_create(num_searcher_online)
    pred_params = model.get_pred_params()
    for efS in efS_list:
        pred_params.efS = efS
        Y_pred, _ = model.predict(
            X_tst, pred_params=pred_params, searchers=searchers, ret_csr=False)
        recall = calc_recall(Y_true, Y_pred)
        assert recall == approx(
            1.0, abs=1e-2
        ), f"hnsw inference failed: data_type=drm, efS={efS}, recall={recall}"
    del searchers, model

    # Test csr features; we can reuse Y_true since the data are the same
    X_trn = smat.csr_matrix(X_trn).astype(np.float32)
    X_tst = smat.csr_matrix(X_tst).astype(np.float32)
    model = HNSW.load(sparse_model_folder)
    searchers = model.searchers_create(num_searcher_online)
    pred_params = model.get_pred_params()
    for efS in efS_list:
        pred_params.efS = efS
        Y_pred, _ = model.predict(
            X_tst, pred_params=pred_params, searchers=searchers, ret_csr=False)
        recall = calc_recall(Y_true, Y_pred)
        assert recall == approx(
            1.0, abs=1e-2
        ), f"hnsw inference failed: data_type=csr, efS={efS}, recall={recall}"
    del searchers, model
def do_spmm_exp(args):
    # Load data
    Y = smat_util.load_matrix(args.y_npz_path).astype(np.float32)
    X = smat_util.load_matrix(args.x_npz_path).astype(np.float32)
    YT_csr = Y.T.tocsr()
    X_csr = X.tocsr()

    # The #threads is controlled by env variables (except for pecos),
    # e.g., export OMP_NUM_THREADS=16, export MKL_NUM_THREADS=16.
    run_time = 0.0
    if args.spmm_algo == "pecos":
        start = time.time()
        Z = pecos_clib.sparse_matmul(
            YT_csr,
            X_csr,
            eliminate_zeros=False,
            sorted_indices=True,
            threads=args.threads,
        )
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "intel-mkl":
        from sparse_dot_mkl import dot_product_mkl

        # Make sure to set the index type to int64 for large matrices:
        # export MKL_INTERFACE_LAYER=ILP64
        start = time.time()
        Z = dot_product_mkl(YT_csr, X_csr, reorder_output=True)
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "scipy":
        # scipy does not sort the indices within each row,
        # so we do it explicitly
        start = time.time()
        Z = YT_csr.dot(X_csr)
        Z.sort_indices()
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "pytorch":
        import torch

        def get_pt_data(A_csr):
            A_indices, A_values = csr_to_coo(A_csr)
            A_pt = torch.sparse_coo_tensor(
                A_indices.T.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_pt

        YT_pt = get_pt_data(YT_csr)
        X_pt = get_pt_data(X_csr)
        start = time.time()
        Z_pt = torch.sparse.mm(YT_pt, X_pt)
        run_time += time.time() - start
        Z_data = Z_pt.coalesce().values().numpy()
    elif args.spmm_algo == "tensorflow":
        import tensorflow.compat.v1 as tf
        from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_ops

        def get_tf_data(A_csr):
            # Define (COO format) Sparse Tensors over Numpy arrays
            A_indices, A_values = csr_to_coo(A_csr)
            A_st = tf.sparse.SparseTensor(
                A_indices.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_st

        # Tensorflow (v2.5.0) usage, as of 07/20/2021:
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/SparseMatrixSparseMatMul
        with tf.Session() as sess:
            YT_st = get_tf_data(YT_csr)
            X_st = get_tf_data(X_csr)
            sess.run(YT_st)
            sess.run(X_st)
            YT_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                YT_st.indices, YT_st.values, YT_st.dense_shape)
            X_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                X_st.indices, X_st.values, X_st.dense_shape)
            start = time.time()
            Z_sm = sparse_csr_matrix_ops.sparse_matrix_sparse_mat_mul(
                a=YT_sm, b=X_sm, type=tf.float32)
            Z_st = sparse_csr_matrix_ops.csr_sparse_matrix_to_sparse_tensor(
                Z_sm, tf.float32)
            Z_data = sess.run(Z_st.values)
            run_time += time.time() - start
    else:
        raise ValueError(f"spmm_algo={args.spmm_algo} is not valid")

    # Note: the original snippet re-assigned run_time = time.time() - start here,
    # which overwrote the accumulated timing; the accumulated value is used instead.
    print(
        "algo {:16s} time(s) {:9.5f} nnz(Z) {:12d} mu(Z.data) {:8.4f}".format(
            args.spmm_algo,
            run_time,
            len(Z_data),
            np.mean(Z_data),
        ))
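# `csr_to_coo` is referenced above but not shown in the snippet; below is a
# plausible sketch consistent with both call sites, which expect an (nnz, 2)
# row/col index array plus a values array.
import numpy as np

def csr_to_coo(A_csr):
    """Return ((nnz, 2) row/col indices, values) for a scipy csr matrix."""
    A_coo = A_csr.tocoo()
    A_indices = np.stack([A_coo.row, A_coo.col], axis=1)
    return A_indices, A_coo.data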