def main(args):
    """Zero-shot STS evaluation: embed both text columns, report Spearman,
    and dump per-pair cosine-similarity predictions to CSV.

    Returns (preds, gold_labels) as numpy arrays.
    """
    # Load the evaluation dataset.
    df = pd.read_csv(args.file)

    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=args.layer,
    )
    model = SentenceEncoder(modules=[embedder, pooler])
    model.eval()

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)

    # Optionally convert Traditional Chinese to Simplified before encoding.
    if args.t2s:
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)

    # Encode both columns in one pass, then split the stacked result.
    combined = model.encode(
        df["text_1"].tolist() + df["text_2"].tolist(),
        batch_size=16,
        show_progress_bar=True,
    )
    n_rows = df.shape[0]
    embeddings1, embeddings2 = combined[:n_rows], combined[n_rows:]

    spearman_score = evaluator(
        embeddings1, embeddings2, labels=df["similarity"].values)
    print(spearman_score)

    # Cosine similarity = 1 - cosine distance.
    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_zero_shot_pred.csv", index=False)
    print(f"Pred {pd.Series(preds).describe()}")
    return preds, df["similarity"].values
def load_model(model_path: str, linear_transform):
    """Build a SentencePairCosineSimilarity model from `model_path`.

    If the path contains a saved SentenceEncoder (modules.json present), load
    it and force mean pooling; otherwise assemble embedder + pooler from bare
    transformer weights. When `linear_transform` is set, warm-start the
    scaler/shift parameters (presumably pred = scaler * cos + shift —
    verify against SentencePairCosineSimilarity).
    """
    saved = Path(model_path)
    if (saved / "modules.json").exists():
        # Fully saved encoder: reuse it, but override the pooling mode.
        encoder = SentenceEncoder(str(model_path))
        encoder[1].pooling_mode_mean_tokens = True
        encoder[1].pooling_mode_cls_token = False
        print(encoder[1].get_config_dict())
    else:
        # Bare weights: build embedder + mean pooler manually.
        embedder = BertWrapper(model_path, max_seq_length=256)
        pooler = PoolingLayer(
            embedder.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False,
            layer_to_use=-1,
        )
        encoder = SentenceEncoder(modules=[embedder, pooler])

    model = SentencePairCosineSimilarity(
        encoder, linear_transform=linear_transform)
    if linear_transform:
        device = model.encoder.device
        model.scaler.data = torch.tensor([0.6]).to(device)
        model.shift.data = torch.tensor([0.3]).to(device)
    return model
def main(args):
    """Run inference with optional apex AMP or TorchScript acceleration.

    TorchScript tracing and apex AMP are mutually exclusive here: when
    tracing, half precision is applied via `.half()` instead of apex.
    """
    encoder = SentenceEncoder(model_path=args.model_path)
    encoder.eval()

    if APEX and args.amp and (not args.torchscript):
        encoder = amp.initialize(encoder, opt_level=args.amp)

    if args.torchscript:
        if args.amp:
            encoder[0].bert = encoder[0].bert.half()
        # Trace the underlying BERT with dummy batch-of-8, seq-len-256 inputs.
        # NOTE(review): assumes the (ids, ids, mask)-style argument order of
        # the wrapped BERT forward — confirm against BertWrapper.
        dummy_inputs = (
            torch.zeros(8, 256).long().cuda(),
            torch.zeros(8, 256).long().cuda(),
            torch.ones(8, 256).long().cuda(),
        )
        encoder[0].bert = torch.jit.trace(encoder[0].bert, dummy_inputs)
        assert isinstance(encoder[0].bert, torch.jit.TopLevelTracedModule)

    encoder.max_seq_length = 256

    # Force mean pooling regardless of the saved pooler configuration.
    print(encoder[1].get_config_dict())
    encoder[1].pooling_mode_cls_token = False
    encoder[1].pooling_mode_mean_tokens = True
    print(encoder[1].get_config_dict())

    preds, _ = raw(args, encoder)
    print(f"Pred {pd.Series(preds).describe()}")
def load_model(model_path):
    """Assemble a SentencePairNliClassification model on top of a
    case-sensitive BERT encoder with mean pooling."""
    embedder = BertWrapper(
        model_path, max_seq_length=256, do_lower_case=False)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=-1,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    return SentencePairNliClassification(encoder)
def load_model(model_path, linear_transform):
    """Assemble a SentencePairCosineSimilarity model (max_seq_length=128)
    with a mean-pooled BERT encoder."""
    embedder = BertWrapper(model_path, max_seq_length=128)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=-1,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    return SentencePairCosineSimilarity(
        encoder, linear_transform=linear_transform)
def main(args):
    """Evaluate pairwise cosine similarity on an LCQMC-style dataset.

    Reports the Spearman correlation between predictions and labels, then
    either grid-searches the best binarization threshold (args.threshold == -1)
    or reports accuracy at the given threshold.
    """
    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=args.layer,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    model = SentencePairCosineSimilarity(encoder, linear_transform=False)
    model.eval()

    dataset = LcqmcDataset(embedder.tokenizer, filename=args.filename)
    # Sort by the longer text of each pair to reduce padding per batch.
    loader = DataLoader(
        dataset,
        sampler=SortSampler(
            dataset,
            key=lambda idx: max(
                len(dataset.text_1[idx]), len(dataset.text_2[idx]))),
        collate_fn=partial(
            collate_pairs,
            pad=0,
            opening_id=embedder.cls_token_id,
            closing_id=embedder.sep_token_id,
            truncate_length=embedder.max_seq_length),
        batch_size=16)

    preds, references = [], []
    with torch.no_grad():
        for features, labels in tqdm(loader):
            for name in features:
                features[name] = features[name].to(encoder.device)
            preds.append(model(features).cpu().numpy())
            references.append(labels.cpu().numpy())
    preds = np.concatenate(preds)
    references = np.concatenate(references)

    spearman_score = spearmanr(preds, references)
    print(f"Spearman: {spearman_score.correlation:.4f}")
    # Fixed: the original message printed max under the "Min" label only.
    print(f"Pred Min: {np.min(preds)}, Max: {np.max(preds)}")

    if args.threshold == -1:
        # Grid-search the decision threshold that maximizes accuracy.
        best_thres, best_acc = -1, -1
        for threshold in np.arange(0.05, 1, 0.05):
            binarized = (preds > threshold).astype("int")
            acc = (binarized == references).sum() / len(references)
            if acc > best_acc:
                best_acc = acc
                best_thres = threshold
        print(f"Best acc: {best_acc:.4f} @ {best_thres:.2f}")
    else:
        binarized = (preds > args.threshold).astype("int")
        acc = (binarized == references).sum() / len(references)
        print(f"Acc: {acc:.4f} @ {args.threshold:.2f}")
def main(model_name,
         mean_pooling: bool = True,
         max_pooling: bool = False,
         cls_token: bool = False,
         max_seq_length: int = 256,
         model_type: Optional[str] = None,
         output_folder: Optional[str] = None):
    """Wrap a pretrained transformer into a SentenceEncoder and save it."""
    if output_folder is None:
        output_folder = f"models/{model_name.split('/')[-1]}"
        print(f"Using the default output folder: {output_folder}")
    output_path = Path(output_folder)
    # NOTE(review): no exist_ok — raises if the folder already exists,
    # presumably to avoid clobbering a previous export.
    output_path.mkdir(parents=True)

    embedder = TransformerWrapper(
        model_name,
        max_seq_length=max_seq_length,
        model_type=model_type,
    )
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=mean_pooling,
        pooling_mode_cls_token=cls_token,
        pooling_mode_max_tokens=max_pooling,
        layer_to_use=-1,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    encoder.save(output_path.resolve())
def load_model(model_path: str, dropout: float,
               n_classes: int = 4) -> SentenceClassification:
    """Assemble a SentenceClassification head over a case-sensitive,
    mean-pooled BERT encoder."""
    embedder = BertWrapper(
        model_path, max_seq_length=256, do_lower_case=False)
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=-1,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    return SentenceClassification(
        encoder, n_classes=n_classes, dropout=dropout)
def load_encoder(model_path, model_type, max_length, do_lower_case,
                 mean_pooling=True, cls=False, max_pooling=False,
                 expand_to_dimension: int = -1):
    """Build a SentenceEncoder with configurable pooling over a
    TransformerWrapper embedder."""
    embedder = TransformerWrapper(
        model_path,
        max_seq_length=max_length,
        do_lower_case=do_lower_case,
        model_type=model_type,
    )
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=mean_pooling,
        pooling_mode_cls_token=cls,
        pooling_mode_max_tokens=max_pooling,
        layer_to_use=-1,
        expand_to_dimension=expand_to_dimension,
    )
    return SentenceEncoder(modules=[embedder, pooler])
def load_model(model_path, model_type, do_lower_case):
    """Assemble a SentencePairCosineSimilarity model (no linear transform)
    over a mean-pooled transformer encoder."""
    embedder = TransformerWrapper(
        model_path,
        max_seq_length=256,
        do_lower_case=do_lower_case,
        model_type=model_type,
    )
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=-1,
    )
    encoder = SentenceEncoder(modules=[embedder, pooler])
    return SentencePairCosineSimilarity(encoder, linear_transform=False)
def main(model_path: str = typer.Argument("streamlit-model/")):
    """Load the encoder into the module-global MODEL and start the API server."""
    global MODEL
    encoder = SentenceEncoder(model_path, device="cpu")
    MODEL = encoder.eval()
    print(f"Listening to port {PORT}")
    uvicorn.run(APP, host='0.0.0.0', port=PORT)
# Fixed: `os`, `List`, and `Optional` were used below but never imported.
import os
from typing import List, Optional

import typer
import uvicorn
from fastapi import FastAPI
from opencc import OpenCC
from pydantic import BaseModel, Field, constr

from oggdo.encoder import SentenceEncoder

# Avoid tokenizer fork warnings/deadlocks when running under a web server.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

PORT = int(os.environ.get("PORT", "8666"))

APP = FastAPI()
T2S = OpenCC('t2s')

# Optionally pre-load the encoder at import time from the MODEL env var;
# otherwise it is populated later (e.g. by a CLI entry point).
MODEL: Optional[SentenceEncoder] = None
if os.environ.get("MODEL", None):
    MODEL = SentenceEncoder(os.environ["MODEL"], device="cpu").eval()
app = APP


class TextInput(BaseModel):
    # Single text to embed; t2s requests Traditional->Simplified conversion.
    text: str = Field(
        None,
        title="The piece of text you want to create embeddings for.",
        max_length=384)
    t2s: bool = False


class BatchTextInput(BaseModel):
    # Batch variant of TextInput.
    text_batch: List[str] = Field(
        [],
        title="Pieces of text you want to create embeddings for.")