def transformer_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    tree_id,
    output_suffix,
    dry_run,
):
    logger.info("Loading Test Set")
    mlb = get_mlb(data_cnf["labels_binarizer"])
    num_labels = len(mlb.classes_)
    test_x, _ = get_data(data_cnf["test"]["texts"], None)
    test_atten_mask = test_x["attention_mask"]
    test_x = test_x["input_ids"]
    logger.info(f"Size of Test Set: {len(test_x):,}")

    logger.info("Predicting")
    test_loader = DataLoader(
        MultiLabelDataset(test_x, attention_mask=test_atten_mask),
        model_cnf["predict"]["batch_size"],
        num_workers=4,
    )
    model_cls = MODEL_TYPE[model_cnf["model"]["base"]]
    network = model_cls.from_pretrained(
        model_cnf["model"]["pretrained"], num_labels=num_labels
    )
    model_cnf["model"].pop("load_model", None)
    model = TransformerXML(
        network,
        model_path,
        load_model=True,
        **data_cnf["model"],
        **model_cnf["model"],
    )
    scores, labels = model.predict(test_loader, k=model_cnf["predict"].get("k", 100))
    labels = mlb.classes_[labels]
    logger.info("Finish Predicting")

    score_path, label_path = output_res(
        data_cnf["output"]["res"],
        f"{model_name}-{data_name}{tree_id}",
        scores,
        labels,
        output_suffix,
    )
    log_results(score_path, label_path, dry_run)

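# A minimal sketch of the config fragments transformer_eval reads above. The
# key names mirror the lookups in the function; every value (paths, model
# base, batch size) is an illustrative assumption, not a shipped config.
example_data_cnf = {
    "labels_binarizer": "data/labels_binarizer",  # fitted MultiLabelBinarizer path
    "test": {"texts": "data/test_texts.npy"},     # tokenized test inputs
    "output": {"res": "results"},                 # results directory
    "model": {},                                  # extra TransformerXML kwargs
}
example_model_cnf = {
    "model": {"base": "bert", "pretrained": "bert-base-uncased"},
    "predict": {"batch_size": 32, "k": 100},
}
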
def random_forest_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    emb_init,
    tree_id,
    output_suffix,
    dry_run,
    num_tree,
):
    mlb_list = []
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(f'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    if 'cluster' not in model_cnf:
        raise Exception("AttentionXML does not currently support random forest mode")
    else:
        labels_binarizer_path = data_cnf['labels_binarizer']
        for i in range(num_tree):
            filename = f"{labels_binarizer_path}_RF_{i}"
            mlb_tree = get_mlb(filename)
            mlb_list.append(mlb_tree)

        scores_list = []
        labels_list = []
        for i, mlb in enumerate(mlb_list):
            logger.info(f"Predicting RF {i}")
            model = FastAttentionXML(
                len(mlb.classes_), data_cnf, model_cnf, tree_id,
                f"{output_suffix}-{i}")
            scores, labels = model.predict(
                test_x, model_cnf['predict'].get('rf_k', 100 // num_tree))
            scores_list.append(scores)
            labels_list.append(mlb.classes_[labels])
            logger.info(f"Finish Predicting RF {i}")

        scores = np.hstack(scores_list)
        labels = np.hstack(labels_list)
        i = np.arange(len(scores))[:, None]
        j = np.argsort(scores)[:, ::-1]
        scores = scores[i, j]
        labels = labels[i, j]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'], f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)
    log_results(score_path, label_path, dry_run)

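# A standalone sketch of the score-merging step in random_forest_eval: the
# per-tree (scores, labels) pairs are concatenated column-wise, then each row
# is re-sorted by score so the global top predictions come first. The toy
# arrays below are assumptions for illustration only.
import numpy as np

scores_a = np.array([[0.9, 0.2], [0.4, 0.3]])
labels_a = np.array([['L1', 'L2'], ['L3', 'L4']])
scores_b = np.array([[0.5, 0.1], [0.8, 0.6]])
labels_b = np.array([['L5', 'L6'], ['L7', 'L8']])

scores = np.hstack([scores_a, scores_b])
labels = np.hstack([labels_a, labels_b])
i = np.arange(len(scores))[:, None]  # row selector, shape (n, 1)
j = np.argsort(scores)[:, ::-1]      # per-row column order, descending
print(scores[i, j])  # [[0.9 0.5 0.2 0.1] [0.8 0.6 0.4 0.3]]
print(labels[i, j])  # [['L1' 'L5' 'L2' 'L6'] ['L7' 'L8' 'L3' 'L4']]
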
def default_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    emb_init,
    tree_id,
    output_suffix,
    dry_run,
):
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(f'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    model_cnf['model'].pop('load_model', None)
    if 'cluster' not in model_cnf:
        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        if 'loss' in model_cnf:
            gamma = model_cnf['loss'].get('gamma', 1.0)
            loss_name = model_cnf['loss']['name']
        else:
            gamma = None
            loss_name = 'bce'

        model = Model(
            network=AttentionRNN,
            labels_num=labels_num,
            model_path=model_path,
            emb_init=emb_init,
            load_model=True,
            loss_name=loss_name,
            gamma=gamma,
            **data_cnf['model'],
            **model_cnf['model'])
        scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 100))
        labels = mlb.classes_[labels]
    else:
        model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id, output_suffix)
        scores, labels = model.predict(test_x, model_cnf['predict'].get('k', 100))
        labels = mlb.classes_[labels]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'], f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)
    log_results(score_path, label_path, dry_run)

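# Hedged illustration of the loss fallback in default_eval: a 'loss' section
# in the model config selects the loss by name (with an optional gamma),
# otherwise plain BCE is used. The config dicts below are assumptions.
for cnf in ({'loss': {'name': 'focal', 'gamma': 2.0}}, {}):
    if 'loss' in cnf:
        gamma = cnf['loss'].get('gamma', 1.0)
        loss_name = cnf['loss']['name']
    else:
        gamma = None
        loss_name = 'bce'
    print(loss_name, gamma)  # focal 2.0, then bce None
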
def main(data_cnf, model_cnf, mode, reg):
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))
    model, model_name, data_name = None, model_cnf['name'], data_cnf['name']
    model_path = os.path.join(model_cnf['path'], f'{model_name}-{data_name}')
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(f'Model Name: {model_name}')

    if mode is None or mode == 'train':
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels'])
        if 'size' in data_cnf['valid']:
            random_state = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=random_state)
        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels'])
        mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(f'Number of Labels: {labels_num}')
        logger.info(f'Size of Training Set: {len(train_x)}')
        logger.info(f'Size of Validation Set: {len(valid_x)}')

        edges = set()
        if reg:
            classes = mlb.classes_.tolist()
            with open(data_cnf['hierarchy']) as fin:
                for line in fin:
                    data = line.strip().split()
                    p = data[0]
                    if p not in classes:
                        continue
                    p_id = classes.index(p)
                    for c in data[1:]:
                        if c not in classes:
                            continue
                        c_id = classes.index(c)
                        edges.add((p_id, c_id))
            logger.info(f'Number of Edges: {len(edges)}')

        logger.info('Training')
        train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                  model_cnf['train']['batch_size'],
                                  shuffle=True, num_workers=4)
        valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=True),
                                  model_cnf['valid']['batch_size'],
                                  num_workers=4)
        model = Model(network=MATCH, labels_num=labels_num, model_path=model_path,
                      emb_init=emb_init, mode='train', reg=reg, hierarchy=edges,
                      **data_cnf['model'], **model_cnf['model'])
        opt_params = {
            'lr': model_cnf['train']['learning_rate'],
            'betas': (model_cnf['train']['beta1'], model_cnf['train']['beta2']),
            'weight_decay': model_cnf['train']['weight_decay'],
        }
        model.train(train_loader, valid_loader, opt_params=opt_params,
                    **model_cnf['train'])  # CHANGE: inserted opt_params
        logger.info('Finish Training')

    if mode is None or mode == 'eval':
        logger.info('Loading Test Set')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(f'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        test_loader = DataLoader(MultiLabelDataset(test_x),
                                 model_cnf['predict']['batch_size'],
                                 num_workers=4)
        if model is None:
            model = Model(network=MATCH, labels_num=labels_num, model_path=model_path,
                          emb_init=emb_init, mode='eval',
                          **data_cnf['model'], **model_cnf['model'])
        scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 100))
        logger.info('Finish Predicting')
        labels = mlb.classes_[labels]
        output_res(data_cnf['output']['res'], f'{model_name}-{data_name}', scores, labels)

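# A sketch of the hierarchy file format main() parses when reg is set: each
# whitespace-separated line lists a parent label followed by its children,
# and only labels known to the binarizer become edges. The sample lines and
# label names are illustrative assumptions.
from io import StringIO

sample = StringIO("Physics Mechanics Optics\nMechanics Statics\nUnknown X\n")
classes = ['Physics', 'Mechanics', 'Optics', 'Statics']
edges = set()
for line in sample:
    p, *children = line.strip().split()
    if p not in classes:
        continue
    for c in children:
        if c in classes:
            edges.add((classes.index(p), classes.index(c)))
print(sorted(edges))  # [(0, 1), (0, 2), (1, 3)]
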
def main(data_cnf, model_cnf, mode, tree_id):
    tree_id = f'-Tree-{tree_id}' if tree_id is not None else ''
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))
    model, model_name, data_name = None, model_cnf['name'], data_cnf['name']
    model_path = os.path.join(model_cnf['path'], f'{model_name}-{data_name}{tree_id}')
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(f'Model Name: {model_name}')

    if mode is None or mode == 'train':
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels'])
        if 'size' in data_cnf['valid']:
            random_state = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=random_state)
        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels'])
        mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(f'Number of Labels: {labels_num}')
        logger.info(f'Size of Training Set: {len(train_x)}')
        logger.info(f'Size of Validation Set: {len(valid_x)}')

        logger.info('Training')
        if 'cluster' not in model_cnf:
            train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                      model_cnf['train']['batch_size'],
                                      shuffle=True, num_workers=4)
            valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=False),
                                      model_cnf['valid']['batch_size'],
                                      num_workers=4)
            model = Model(network=AttentionRNN, labels_num=labels_num,
                          model_path=model_path, emb_init=emb_init,
                          **data_cnf['model'], **model_cnf['model'])
            model.train(train_loader, valid_loader, **model_cnf['train'])
        else:
            model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            model.train(train_x, train_y, valid_x, valid_y, mlb)
        logger.info('Finish Training')

    if mode is None or mode == 'eval':
        logger.info('Loading Test Set')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(f'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        if 'cluster' not in model_cnf:
            test_loader = DataLoader(MultiLabelDataset(test_x),
                                     model_cnf['predict']['batch_size'],
                                     num_workers=4)
            if model is None:
                model = Model(network=AttentionRNN, labels_num=labels_num,
                              model_path=model_path, emb_init=emb_init,
                              **data_cnf['model'], **model_cnf['model'])
            scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 100))
        else:
            if model is None:
                model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            scores, labels = model.predict(test_x)
        logger.info('Finish Predicting')
        labels = mlb.classes_[labels]
        output_res(data_cnf['output']['res'], f'{model_name}-{data_name}{tree_id}', scores, labels)

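# Standalone sketch of the validation-split branch shared by both main()
# variants: a 'size' entry in data_cnf['valid'] carves the validation set out
# of the training data with sklearn's train_test_split; otherwise separate
# validation files are loaded. The toy data below is an assumption.
import numpy as np
from sklearn.model_selection import train_test_split

train_x = np.arange(10)
train_labels = np.array([f'L{i}' for i in range(10)], dtype=object)
train_x, valid_x, train_labels, valid_labels = train_test_split(
    train_x, train_labels, test_size=0.2, random_state=1240)
print(len(train_x), len(valid_x))  # 8 2
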
def spectral_clustering_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    emb_init,
    tree_id,
    output_suffix,
    dry_run,
):
    mlb_list = []
    n_clusters = model_cnf['spectral_clustering']['num_clusters']
    labels_binarizer_path = data_cnf['labels_binarizer']
    scores_list = []
    labels_list = []

    logger.info('Loading Test Set')
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(f'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    if 'cluster' not in model_cnf:
        suffix = output_suffix.upper().replace('-', '_')
        for i in range(n_clusters):
            filename = f"{labels_binarizer_path}_{suffix}_{i}"
            mlb_tree = get_mlb(filename)
            mlb_list.append(mlb_tree)

        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        for i, mlb in enumerate(mlb_list):
            logger.info(f"Predicting Cluster {i}")
            labels_num = len(mlb.classes_)
            k = model_cnf['predict'].get('k', 100) // n_clusters
            model = Model(
                network=AttentionRNN,
                labels_num=labels_num,
                model_path=f'{model_path}-{i}',
                emb_init=emb_init,
                load_model=True,
                **data_cnf['model'],
                **model_cnf['model'])
            scores, labels = model.predict(test_loader, k=k)
            scores_list.append(scores)
            labels_list.append(mlb.classes_[labels])
            logger.info(f"Finish Predicting Cluster {i}")

        scores = np.hstack(scores_list)
        labels = np.hstack(labels_list)
        i = np.arange(len(scores))[:, None]
        j = np.argsort(scores)[:, ::-1]
        scores = scores[i, j]
        labels = labels[i, j]
    else:
        mlb = get_mlb(data_cnf['labels_binarizer'])
        model = FastAttentionXML(len(mlb.classes_), data_cnf, model_cnf, tree_id, output_suffix)
        scores, labels = model.predict(test_x, model_cnf['predict'].get('k', 100))
        labels = mlb.classes_[labels]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'], f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)
    log_results(score_path, label_path, dry_run)

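# Sketch of the per-cluster binarizer naming used in spectral_clustering_eval:
# output_suffix is upper-cased with '-' replaced by '_' and the cluster index
# appended. The path and suffix values are assumptions.
labels_binarizer_path = 'data/labels_binarizer'
output_suffix = 'spectral-clustering'
suffix = output_suffix.upper().replace('-', '_')
print([f"{labels_binarizer_path}_{suffix}_{i}" for i in range(3)])
# ['data/labels_binarizer_SPECTRAL_CLUSTERING_0',
#  'data/labels_binarizer_SPECTRAL_CLUSTERING_1',
#  'data/labels_binarizer_SPECTRAL_CLUSTERING_2']
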
def main(data_cnf, model_cnf, mode):
    model_name = os.path.split(model_cnf)[1].split(".")[0]
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))
    # Set the log file location
    logfile("./logs/logfile_{0}_cornet_{1}_cornet_dim_{2}.log".format(
        model_name, model_cnf['model']['n_cornet_blocks'],
        model_cnf['model']['cornet_dim']))
    model, model_name, data_name = None, model_cnf['name'], data_cnf['name']
    model_path = os.path.join(
        model_cnf['path'],
        f'{model_name}-{data_name}-{model_cnf["model"]["n_cornet_blocks"]}-{model_cnf["model"]["cornet_dim"]}')
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(f'Model Name: {model_name}')
    # summary(model_dict[model_name])

    if mode is None or mode == 'train':
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels'])
        if 'size' in data_cnf['valid']:
            random_state = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=random_state)
        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels'])
        mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(f'Number of Labels: {labels_num}')
        logger.info(f'Size of Training Set: {len(train_x)}')
        logger.info(f'Size of Validation Set: {len(valid_x)}')

        logger.info('Training')
        train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                  model_cnf['train']['batch_size'],
                                  shuffle=True, num_workers=4)
        valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=True),
                                  model_cnf['valid']['batch_size'],
                                  num_workers=4)
        if 'gpipe' not in model_cnf:
            model = Model(network=model_dict[model_name], labels_num=labels_num,
                          model_path=model_path, emb_init=emb_init,
                          **data_cnf['model'], **model_cnf['model'])
        else:
            model = GPipeModel(model_name, labels_num=labels_num,
                               model_path=model_path, emb_init=emb_init,
                               **data_cnf['model'], **model_cnf['model'])
        loss, p1, p5 = model.train(train_loader, valid_loader, **model_cnf['train'])
        np.save(
            model_cnf['np_loss'] + "{0}_cornet_{1}_cornet_dim_{2}.npy".format(
                model_name, model_cnf['model']['n_cornet_blocks'],
                model_cnf['model']['cornet_dim']), loss)
        np.save(
            model_cnf['np_p1'] + "{0}_cornet_{1}_cornet_dim_{2}.npy".format(
                model_name, model_cnf['model']['n_cornet_blocks'],
                model_cnf['model']['cornet_dim']), p1)
        np.save(
            model_cnf['np_p5'] + "{0}_cornet_{1}_cornet_dim_{2}.npy".format(
                model_name, model_cnf['model']['n_cornet_blocks'],
                model_cnf['model']['cornet_dim']), p5)
        logger.info('Finish Training')

    if mode is None or mode == 'eval':
        logger.info('Loading Test Set')
        logger.info(f'model path: {model_path}')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(f'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        test_loader = DataLoader(MultiLabelDataset(test_x),
                                 model_cnf['predict']['batch_size'],
                                 num_workers=4)
        if 'gpipe' not in model_cnf:
            if model is None:
                model = Model(network=model_dict[model_name], labels_num=labels_num,
                              model_path=model_path, emb_init=emb_init,
                              **data_cnf['model'], **model_cnf['model'])
        else:
            if model is None:
                model = GPipeModel(model_name, labels_num=labels_num,
                                   model_path=model_path, emb_init=emb_init,
                                   **data_cnf['model'], **model_cnf['model'])
        scores, labels = model.predict(test_loader, k=model_cnf['predict'].get('k', 3801))
        logger.info('Finish Predicting')
        labels = mlb.classes_[labels]
        output_res(data_cnf['output']['res'], f'{model_name}-{data_name}', scores, labels)

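# Standalone sketch of the checkpoint-path convention built in this main():
# the CorNet block count and dimension are baked into the model path, so each
# hyperparameter combination gets its own checkpoint. Values are assumptions.
import os

model_cnf = {'path': 'models', 'model': {'n_cornet_blocks': 2, 'cornet_dim': 1000}}
model_name, data_name = 'CorNetAttentionXML', 'EUR-Lex'
model_path = os.path.join(
    model_cnf['path'],
    f"{model_name}-{data_name}-{model_cnf['model']['n_cornet_blocks']}-{model_cnf['model']['cornet_dim']}")
print(model_path)  # models/CorNetAttentionXML-EUR-Lex-2-1000
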
def splitting_head_tail_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    emb_init,
    tree_id,
    output_suffix,
    dry_run,
    split_ratio,
    head_labels,
    tail_labels,
    head_model,
    tail_model,
):
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(f'Size of Test Set: {len(test_x):,}')

    labels_binarizer_path = data_cnf['labels_binarizer']
    mlb_h = get_mlb(f"{labels_binarizer_path}_h_{split_ratio}")
    mlb_t = get_mlb(f"{labels_binarizer_path}_t_{split_ratio}")

    if head_labels is None:
        train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels'])
        head_labels, _, tail_labels, _ = get_head_tail_labels(
            train_labels,
            split_ratio,
        )
    h_labels_i = np.nonzero(mlb.transform(head_labels[None, ...]).toarray())[0]
    t_labels_i = np.nonzero(mlb.transform(tail_labels[None, ...]).toarray())[0]

    logger.info('Predicting')
    if 'cluster' not in model_cnf:
        test_loader = DataLoader(MultiLabelDataset(test_x),
                                 model_cnf['predict']['batch_size'],
                                 num_workers=4)
        if head_model is None:
            head_model = Model(network=AttentionRNN, labels_num=len(head_labels),
                               model_path=f'{model_path}-head', emb_init=emb_init,
                               load_model=True,
                               **data_cnf['model'], **model_cnf['model'])
        logger.info('Predicting Head Model')
        h_k = model_cnf['predict'].get('top_head_k', 30)
        scores_h, labels_h = head_model.predict(test_loader, k=h_k)
        labels_h = mlb_h.classes_[labels_h]
        logger.info('Finish Predicting Head Model')

        if tail_model is None:
            tail_model = Model(network=AttentionRNN, labels_num=len(tail_labels),
                               model_path=f'{model_path}-tail', emb_init=emb_init,
                               load_model=True,
                               **data_cnf['model'], **model_cnf['model'])
        logger.info('Predicting Tail Model')
        t_k = model_cnf['predict'].get('top_tail_k', 70)
        scores_t, labels_t = tail_model.predict(test_loader, k=t_k)
        labels_t = mlb_t.classes_[labels_t]
        logger.info('Finish Predicting Tail Model')

        scores = np.c_[scores_h, scores_t]
        labels = np.c_[labels_h, labels_t]
        i = np.arange(len(scores))[:, None]
        j = np.argsort(scores)[:, ::-1]
        scores = scores[i, j]
        labels = labels[i, j]
    else:
        raise Exception("FastAttentionXML is not currently supported for "
                        "split head and tail datasets")

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'], f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)
    log_results(score_path, label_path, dry_run)

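# Sketch of the head/tail merge above: the top-h_k head and top-t_k tail
# predictions are column-joined with np.c_ and each row is re-sorted by
# score, so the combined ranking interleaves both models. Toy values are
# assumptions.
import numpy as np

scores_h = np.array([[0.9, 0.7]])
labels_h = np.array([['head_A', 'head_B']])
scores_t = np.array([[0.8, 0.1]])
labels_t = np.array([['tail_C', 'tail_D']])

scores = np.c_[scores_h, scores_t]
labels = np.c_[labels_h, labels_t]
i = np.arange(len(scores))[:, None]
j = np.argsort(scores)[:, ::-1]
print(labels[i, j])  # [['head_A' 'tail_C' 'head_B' 'tail_D']]
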